Example #1
class ZookeeperWatcher():
    zoo_client = None  # The KazooClient to manage the config
    point_path = None  # Zookeeper path to the pointed-at file
    pointed_at_expired = None  # is True when the assignment has been set to
                               # None but we cannot remove the config listener
    valid_handler = None  # the function to call when the validity changes
    config_handler = None  # the function to call when the config changes
    error_handler = None  # the function to call when an error occurs in reading
    valid_file = False  # the current state of the ConfigWatcher with ZK
    do_not_restart = False  # used when closing via ^C
    old_data = ''  # The current file contents, to see if a change occurred
    old_pointed = ''  # the current pointed path, to see if change occurred

    INVALID_PATH = "Invalid pointer path"
    INVALID_GET = "Invalid get on file path"
    BAD_CONNECTION = "Connection interrupted with Zookeeper, re-establishing"

    def __init__(self,
                 hosts,
                 filepath,
                 valid_handler=None,
                 config_handler=None,
                 error_handler=None,
                 pointer=False,
                 ensure=False,
                 valid_init=True):
        '''
        Zookeeper file watcher, used to tell a program its zookeeper file has
        changed. Can be used to watch a single file, or both a file and the
        file its contents point to. Manages all connections, drops, and
        reconnections for you.

        @param hosts: The zookeeper hosts to use
        @param filepath: The full path to the file to watch
        @param valid_handler: The method to call for a 'is valid' state change
        @param config_handler: The method to call when a content change occurs
        @param error_handler: The method to call when an error occurs
        @param pointer: Set to true if the file contents are actually a path to
                        another zookeeper file, where the real config resides
        @param ensure: Set to true for the ZooWatcher to create the watched file
        @param valid_init: Ensure the client can connect to Zookeeper first try

        Ex 1. /stuff/A: "stuff I care about"
        Ex 2. /stuff/A: "/other/stuff", /other/stuff: "contents I care about"
            - in Ex 2 you care about /other/stuff contents
              but are only aware of your assignment /stuff/A

        You can use this class as any combination of event driven or polling.
        Polling:
            In the main loop of your program, check if is_valid() is
            True, otherwise clear your contents as there is some ZK error.
        Event:
            You will be notified via the various handlers when content changes.
        '''
        self.hosts = hosts
        self.my_file = filepath
        self.pointer = pointer
        self.ensure = ensure
        self.valid_handler = valid_handler
        self.config_handler = config_handler
        self.error_handler = error_handler

        if valid_init:
            # this will throw an exception if it can't start right away
            self.zoo_client = KazooClient(hosts=self.hosts)
            self.zoo_client.start()

        self.threaded_start(no_init=True)

    def threaded_start(self, no_init=False):
        '''
        Spawns a worker thread to set up the zookeeper connection
        '''
        thread = Thread(target=self.init_connections,
                        kwargs={'no_init': no_init})
        thread.daemon = True
        thread.start()
        thread.join()

    def init_connections(self, no_init=False):
        '''
        Sets up the initial Kazoo Client and watches
        '''
        success = False
        self.set_valid(False)

        if not no_init:
            if self.zoo_client:
                self.zoo_client.remove_listener(self.state_listener)
                self.old_data = ''
                self.old_pointed = ''

            while not success:
                try:
                    if self.zoo_client is None:
                        self.zoo_client = KazooClient(hosts=self.hosts)
                        self.zoo_client.start()
                    else:
                        # self.zoo_client.stop()
                        self.zoo_client._connection.connection_stopped.set()
                        self.zoo_client.close()
                        self.zoo_client = KazooClient(hosts=self.hosts)
                        self.zoo_client.start()
                except Exception as e:
                    print("ZKWatcher Exception:", e)
                    sleep(1)
                    continue

                self.setup()
                success = self.update_file(self.my_file)
                sleep(5)
        else:
            self.setup()
            self.update_file(self.my_file)
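
A minimal usage sketch for the class above, combining the polling and
event-driven styles its docstring describes. The host string, znode path,
and handler bodies are illustrative assumptions, not part of the original
example.

from time import sleep

def on_config(conf):
    # hypothetical handler: receives the new file contents
    print("config changed:", conf)

def on_error(message):
    # hypothetical handler: receives the watcher's error message
    print("watcher error:", message)

watcher = ZookeeperWatcher(hosts='localhost:2181',
                           filepath='/stuff/A',
                           config_handler=on_config,
                           error_handler=on_error)

cached_config = None
while True:
    if not watcher.is_valid():
        # some ZK error occurred: clear our cached contents
        cached_config = None
    sleep(5)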
Example #2
class ZookeeperWatcher(object):
    zoo_client = None  # The KazooClient to manage the config
    point_path = None  # Zookeeper path to the pointed-at file
    pointed_at_expired = None  # is True when the assignment has been set to
                               # None but we cannot remove the config listener
    valid_handler = None  # the function to call when the validity changes
    config_handler = None  # the function to call when the config changes
    error_handler = None  # the function to call when an error occurs in reading
    valid_file = False  # the current state of the ConfigWatcher with ZK
    do_not_restart = False  # used when closing via ^C
    old_data = ''  # The current file contents, to see if a change occurred
    old_pointed = ''  # the current pointed path, to see if change occurred

    INVALID_PATH = "Invalid pointer path"
    INVALID_GET = "Invalid get on file path"
    BAD_CONNECTION = "Connection interrupted with Zookeeper, re-establishing"

    def __init__(self,
                 hosts,
                 filepath,
                 valid_handler=None,
                 config_handler=None,
                 error_handler=None,
                 pointer=False,
                 ensure=False,
                 valid_init=True):
        '''
        Zookeeper file watcher, used to tell a program its zookeeper file has
        changed. Can be used to watch a single file, or both a file and the
        file its contents point to. Manages all connections, drops, and
        reconnections for you.

        @param hosts: The zookeeper hosts to use
        @param filepath: The full path to the file to watch
        @param valid_handler: The method to call for a 'is valid' state change
        @param config_handler: The method to call when a content change occurs
        @param error_handler: The method to call when an error occurs
        @param pointer: Set to true if the file contents are actually a path to
                        another zookeeper file, where the real config resides
        @param ensure: Set to true for the ZooWatcher to create the watched file
        @param valid_init: Ensure the client can connect to Zookeeper first try

        Ex 1. /stuff/A: "stuff I care about"
        Ex 2. /stuff/A: "/other/stuff", /other/stuff: "contents I care about"
            - in Ex 2 you care about /other/stuff contents
              but are only aware of your assignment /stuff/A

        You can use this class as any combination of event driven or polling.
        Polling:
            In the main loop of your program, check if is_valid() is
            True, otherwise clear your contents as there is some ZK error.
        Event:
            You will be notified via the various handlers when content changes.
        '''
        self.hosts = hosts
        self.my_file = filepath
        self.pointer = pointer
        self.ensure = ensure
        self.valid_handler = valid_handler
        self.config_handler = config_handler
        self.error_handler = error_handler

        if valid_init:
            # this will throw an exception if it can't start right away
            self.zoo_client = KazooClient(hosts=self.hosts)
            self.zoo_client.start()

        self.threaded_start(no_init=True)

    def threaded_start(self, no_init=False):
        '''
        Spawns a worker thread to set up the zookeeper connection
        '''
        thread = Thread(target=self.init_connections,
                        kwargs={'no_init': no_init})
        thread.daemon = True
        thread.start()
        thread.join()

    def init_connections(self, no_init=False):
        '''
        Sets up the initial Kazoo Client and watches
        '''
        success = False
        self.set_valid(False)

        if not no_init:
            if self.zoo_client:
                self.zoo_client.remove_listener(self.state_listener)
                self.old_data = ''
                self.old_pointed = ''

            while not success:
                try:
                    if self.zoo_client is None:
                        self.zoo_client = KazooClient(hosts=self.hosts)
                        self.zoo_client.start()
                    else:
                        # self.zoo_client.stop()
                        self.zoo_client._connection.connection_stopped.set()
                        self.zoo_client.close()
                        self.zoo_client = KazooClient(hosts=self.hosts)
                        self.zoo_client.start()
                except Exception as e:
                    log.error("ZKWatcher Exception: " + e.message)
                    sleep(1)
                    continue

                self.setup()
                success = self.update_file(self.my_file)
                sleep(5)
        else:
            self.setup()
            self.update_file(self.my_file)

    def setup(self):
        '''
        Ensures the path to the watched file exists and we have a state
        listener
        '''
        self.zoo_client.add_listener(self.state_listener)

        if self.ensure:
            self.zoo_client.ensure_path(self.my_file)

    def state_listener(self, state):
        '''
        Restarts the session if we get anything besides CONNECTED
        '''
        if state == KazooState.SUSPENDED:
            self.set_valid(False)
            self.call_error(self.BAD_CONNECTION)
        elif state == KazooState.LOST and not self.do_not_restart:
            self.threaded_start()
        elif state == KazooState.CONNECTED:
            # This is going to throw a SUSPENDED kazoo error
            # which will cause the sessions to be wiped and re established.
            # Used b/c of massive connection pool issues
            self.zoo_client.stop()

    def is_valid(self):
        '''
        @return: True if the currently watched file is valid
        '''
        return self.valid_file

    def ping(self):
        '''
        Simple command to test if the zookeeper session is able to connect
        at this very moment
        '''
        try:
            # dummy ping to ensure we are still connected
            self.zoo_client.server_version()
            return True
        except KazooException:
            return False

    def close(self, kill_restart=True):
        '''
        Use when you would like to close everything down
        @param kill_restart: Prevent the kazoo restart from occurring
        '''
        self.do_not_restart = kill_restart
        self.zoo_client.stop()
        self.zoo_client.close()

    def get_file_contents(self, pointer=False):
        '''
        Gets any file contents you care about. Defaults to the main file
        @param pointer: Set True to return the contents of the pointer file
        itself, rather than the pointed-at file
        @return: A string of the contents
        '''
        if self.pointer:
            if pointer:
                return self.old_pointed
            else:
                return self.old_data
        else:
            return self.old_data

    def watch_file(self, event):
        '''
        Fired when changes are made to the file
        '''
        if not self.update_file(self.my_file):
            self.threaded_start()

    def update_file(self, path):
        '''
        Updates the file watcher and calls the appropriate method for results
        @return: False if we need to keep trying the connection
        '''
        try:
            # grab the file
            result, stat = self.zoo_client.get(path, watch=self.watch_file)
        except ZookeeperError:
            self.set_valid(False)
            self.call_error(self.INVALID_GET)
            return False

        if self.pointer:
            if result is not None and len(result) > 0:
                self.pointed_at_expired = False
                # file is a pointer, go update and watch other file
                self.point_path = result
                if self.compare_pointer(result):
                    self.update_pointed()
            else:
                self.pointed_at_expired = True
                self.old_pointed = ''
                self.old_data = ''
                self.set_valid(False)
                self.call_error(self.INVALID_PATH)
        else:
            # file is not a pointer, return contents
            if self.compare_data(result):
                self.call_config(result)
            self.set_valid(True)

        return True

    def watch_pointed(self, event):
        '''
        Fired when changes are made to the pointed file
        '''
        self.update_pointed()

    def update_pointed(self):
        '''
        Grabs the latest file contents based on the pointer uri
        '''
        # only grab file if our pointer is still good (not None)
        if not self.pointed_at_expired:
            try:
                conf_string, stat2 = self.zoo_client.get(
                    self.point_path, watch=self.watch_pointed)
            except ZookeeperError:
                self.old_data = ''
                self.set_valid(False)
                self.pointed_at_expired = True
                self.call_error(self.INVALID_PATH)
                return

            if self.compare_data(conf_string):
                self.call_config(conf_string)
            self.set_valid(True)

    def set_valid(self, boolean):
        '''
        Sets the state and calls the change if needed
        @param boolean: The new state (True or False)
        '''
        old_state = self.is_valid()
        self.valid_file = boolean

        if old_state != self.valid_file:
            self.call_valid(self.valid_file)

    def call_valid(self, state):
        '''
        Calls the valid change function passed in
        @param state: The new validity state
        '''
        if self.valid_handler is not None:
            self.valid_handler(self.is_valid())

    def call_config(self, new_config):
        '''
        Calls the config function passed in
        @param new_config: The new config
        '''
        if self.config_handler is not None:
            self.config_handler(new_config)

    def call_error(self, message):
        '''
        Calls the error function passed in
        @param message: The message to throw
        '''
        if self.error_handler is not None:
            self.error_handler(message)

    def compare_data(self, data):
        '''
        Compares the string data
        @return: True if the data is different
        '''
        if self.old_data != data:
            self.old_data = data
            return True
        return False

    def compare_pointer(self, data):
        '''
        Compares the string data
        @return: True if the data is different
        '''
        if self.old_pointed != data:
            self.old_pointed = data
            return True
        return False
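
In pointer mode (Ex 2 in the docstring), the watched node holds the path of
the node where the real config lives. Below is a sketch of how that layout
might be seeded with a plain KazooClient; all paths and contents are
illustrative assumptions.

from kazoo.client import KazooClient

zk = KazooClient(hosts='localhost:2181')
zk.start()
zk.ensure_path('/other/stuff')
zk.set('/other/stuff', b'contents I care about')
zk.ensure_path('/stuff/A')
zk.set('/stuff/A', b'/other/stuff')
zk.stop()

# The watcher then follows the indirection for you:
watcher = ZookeeperWatcher(hosts='localhost:2181', filepath='/stuff/A',
                           pointer=True)
print(watcher.get_file_contents())              # contents of /other/stuff
print(watcher.get_file_contents(pointer=True))  # the pointer itself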
Example #3
class PartitionClient(object):
    """ Client Class for the Partition Library
    Example usage:
    ---------------------
    import libpartition
    from libpartition.libpartition import PartitionClient

    def own_change_cb(l):
        print("ownership change:" + str(l))

    c = PartitionClient("test", "s1", ["s1", "s2", "s3"], 32, 
            own_change_cb, "zookeeper_s1")

    ## do some real work now
    if (c.own_partition(1)):
        ...... do something with partition #1 .....
        .........
    ...
    c.update_cluster_list(["s1", "s2"])
    ...
    ----------------------
    You should not call any partition library routine from within the 
    callback function

    Args:
        app_name(str): Name of the app for which partition cluster is used
        self_name(str): Name of the local cluster node (can be ip address)
        cluster_list(list): List of all the nodes in the cluster including 
            local node
        max_partition(int): Partition space always goes from 0..max_partition-1
        partition_update_cb: Callback function invoked when partition
            ownership list is updated.
        zk_server(str): <zookeeper server>:<zookeeper server port>
    """
    def __init__(self,
                 app_name,
                 self_name,
                 cluster_list,
                 max_partition,
                 partition_update_cb,
                 zk_server,
                 logger=None):

        # Initialize local variables
        self._zk_server = zk_server
        self._cluster_list = set(cluster_list)
        self._max_partition = max_partition
        self._update_cb = partition_update_cb
        self._curr_part_ownership_list = []
        self._target_part_ownership_list = []
        self._con_hash = ConsistentHash(cluster_list)
        self._name = self_name

        # some sanity check
        if not (self._name in cluster_list):
            raise ValueError('cluster list is missing local server name')

        # initialize logging and other stuff
        if logger is None:
            logging.basicConfig()
            self._logger = logging
        else:
            self._logger = logger
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')

        # connect to zookeeper
        while True:
            self._logger.error("Libpartition zk start")
            self._zk = KazooClient(zk_server, timeout=60.0)
            self._zk.add_listener(self._zk_listen)
            try:
                self._zk.start()
                while self._conn_state != ConnectionStatus.UP:
                    gevent.sleep(1)
                break
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                self._zk.remove_listener(self._zk_listen)
                try:
                    self._zk.stop()
                    self._zk.close()
                except Exception as ex:
                    template = "Exception {0} in Libpartition zk stop/close. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s" % \
                        (messag, traceback.format_exc(), self._name))
                finally:
                    self._zk = None
                gevent.sleep(1)

        # create a lock array to contain locks for each partition
        self._part_locks = []
        for part in range(0, self._max_partition):
            lockpath = "/lockpath/" + app_name + "/" + str(part)
            l = self._zk.Lock(lockpath, self._name)
            self._part_locks.append(l)

        # initialize partition # to lock acquire greenlet dictionary
        self._part_lock_task_dict = {}

        self._logger.error("initial servers:" + str(self._cluster_list))

        # update target partition ownership list
        for part in range(0, self._max_partition):
            if (self._con_hash.get_node(str(part)) == self._name):
                self._target_part_ownership_list.append(part)

        # update current ownership list
        self._acquire_partition_ownership()

    #end __init__

    def _sandesh_connection_info_update(self, status, message):
        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name='Zookeeper',
                               status=new_conn_state,
                               message=message,
                               server_addrs=self._zk_server.split(','))

        if (self._conn_state and self._conn_state != ConnectionStatus.DOWN
                and new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' % (message)
            self._logger.error(msg)
        if (self._conn_state and self._conn_state != new_conn_state
                and new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._logger.error(msg)

        self._conn_state = new_conn_state

    # end _sandesh_connection_info_update

    def _zk_listen(self, state):
        self._logger.error("Libpartition listen %s" % str(state))
        if state == KazooState.CONNECTED:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
        elif state == KazooState.LOST:
            self._logger.error("Libpartition connection LOST")
            # Lost the session with ZooKeeper Server
            # Best of option we have is to exit the process and restart all
            # over again
            self._sandesh_connection_info_update(
                status='DOWN', message='Connection to Zookeeper lost')
            os._exit(2)
        elif state == KazooState.SUSPENDED:
            self._logger.error("Libpartition connection SUSPENDED")
            # Update connection info
            self._sandesh_connection_info_update(
                status='INIT',
                message='Connection to zookeeper lost. Retrying')

    # following routine is the greenlet task function to acquire the lock
    # for a partition
    def _acquire_lock(self, part):
        # lock for the partition
        l = self._part_locks[part]

        # go in an infinite loop waiting to acquire the lock
        try:
            while True:
                ret = l.acquire(blocking=False)
                if ret == True:
                    self._logger.error("Acquired lock for:" + str(part))
                    self._curr_part_ownership_list.append(part)
                    self._update_cb(self._curr_part_ownership_list)
                    return True
                else:
                    gevent.sleep(1)
        except CancelledError:
            self._logger.error("Lock acquire cancelled for:" + str(part))
            return False
        except Exception as ex:
            # TODO: If we have a non-KazooException, the lock object
            #       may get stuck in the "cancelled" state
            self._logger.error("Lock acquire unexpected error!: " + str(ex))
            # This exception should get propagated to the main thread
            raise SystemExit(1)

    #end _acquire_lock

    # get rid of finished spawned tasks from datastructures
    def _cleanup_greenlets(self):
        for part in list(self._part_lock_task_dict.keys()):
            if (self._part_lock_task_dict[part].ready()):
                del self._part_lock_task_dict[part]

    #end _cleanup_greenlets

    # following routine launches tasks to acquire partition locks
    def _acquire_partition_ownership(self):
        # cleanup any finished greenlets
        self._cleanup_greenlets()

        # this variable will help us decide if we need to call callback
        updated_curr_ownership = False

        # list of partitions for which locks have to be released
        release_lock_list = []

        self._logger.info("known servers: %s" % self._con_hash.get_all_nodes())

        for part in range(0, self._max_partition):
            if (part in self._target_part_ownership_list):
                if (part in self._curr_part_ownership_list):
                    # do nothing, I already have ownership of this partition
                    self._logger.info("No need to acquire ownership of:" +
                                      str(part))
                else:
                    # I need to acquire lock for this partition before I own
                    if (part in list(self._part_lock_task_dict.keys())):
                        try:
                            self._part_lock_task_dict[part].get(block=False)
                        except:
                            # do nothing there is already a greenlet running to
                            # acquire the lock
                            self._logger.error("Already a greenlet running to"
                                               " acquire:" + str(part))
                            continue

                        # Greenlet died without getting ownership. Cleanup
                        self._logger.error("Cleanup stale greenlet running to"
                                           " acquire:" + str(part))
                        del self._part_lock_task_dict[part]

                    self._logger.error("Starting greenlet running to"
                                       " acquire:" + str(part))
                    # launch the greenlet to acquire the lock
                    g = Greenlet.spawn(self._acquire_lock, part)
                    self._part_lock_task_dict[part] = g

            else:
                # give up ownership of the partition

                # cancel any lock acquisition which is ongoing
                if (part in list(self._part_lock_task_dict.keys())):
                    try:
                        self._part_lock_task_dict[part].get(block=False)
                    except:
                        self._logger.error(
                            "canceling lock acquisition going on for:" +
                            str(part))
                        # Cancelling the lock should result in killing the gevent
                        self._part_locks[part].cancel()
                        self._part_lock_task_dict[part].get(block=True)

                    del self._part_lock_task_dict[part]

                if (part in self._curr_part_ownership_list):
                    release_lock_list.append(part)
                    self._curr_part_ownership_list.remove(part)
                    updated_curr_ownership = True
                    self._logger.error("giving up ownership of:" + str(part))

        if (updated_curr_ownership is True):
            # current partition membership was updated call the callback
            self._update_cb(self._curr_part_ownership_list)

        if (len(release_lock_list) != 0):
            # release locks which were acquired
            for part in release_lock_list:
                self._logger.error("release the lock which was acquired:" + \
                        str(part))
                try:
                    self._part_locks[part].release()
                    self._logger.error("fully gave up ownership of:" +
                                       str(part))
                except:
                    pass

    #end _acquire_partition_ownership

    def update_cluster_list(self, cluster_list):
        """ Updates the cluster node list
        Args:
            cluster_list(list): New list of names of the nodes in 
                the cluster
        Returns:
            None
        """
        # some sanity check
        if not (self._name in cluster_list):
            raise ValueError('cluster list is missing local server name')

        new_cluster_list = set(cluster_list)
        new_servers = list(new_cluster_list.difference(self._cluster_list))
        deleted_servers = list(
            set(self._cluster_list).difference(new_cluster_list))
        self._cluster_list = set(cluster_list)

        # update the hash structure
        if new_servers:
            self._logger.error("new servers:" + str(new_servers))
            self._con_hash.add_nodes(new_servers)
        if deleted_servers:
            self._logger.error("deleted servers:" + str(deleted_servers))
            self._con_hash.del_nodes(deleted_servers)

        # update target partition ownership list
        self._target_part_ownership_list = []
        for part in range(0, self._max_partition):
            if (self._con_hash.get_node(str(part)) == self._name):
                if not (part in self._target_part_ownership_list):
                    self._target_part_ownership_list.append(part)

        # update current ownership list
        self._acquire_partition_ownership()

    #end update_cluster_list

    def own_partition(self, part_no):
        """ Returns ownership information of a partition
        Args:
            part_no(int) : Partition no 
        Returns:
            True if partition is owned by the local node
            False if partition is not owned by the local node
        """
        return part_no in self._curr_part_ownership_list

    #end own_partition

    def close(self):
        """ Closes any connections and frees up any data structures
        Args:
        Returns:
            None
        """
        # clean up greenlets
        for part in list(self._part_lock_task_dict.keys()):
            try:
                self._logger.error("libpartition greenlet cleanup %s" %
                                   str(part))
                self._part_lock_task_dict[part].kill()
            except:
                pass

        self._zk.remove_listener(self._zk_listen)
        gevent.sleep(1)
        self._logger.error("Stopping libpartition")
        # close zookeeper
        try:
            self._zk.stop()
        except:
            self._logger.error("Stopping libpartition failed")
        else:
            self._logger.error("Stopping libpartition successful")

        self._logger.error("Closing libpartition")
        try:
            self._zk.close()
        except:
            self._logger.error("Closing libpartition failed")
        else:
            self._logger.error("Closing libpartition successful")
Example #4
   gargle.add_argument('--region', metavar="<aws-region-spec>",
                       help='AWS region where the snapshots are stored (default: region of host instance)')

   gargle.add_argument('--snaps', action='store_true',
                       help='check snapshots from most recent backup (default: False)')

   args = gargle.parse_args()

   try:
      y = yaml.safe_load(open(args.yaml))
      servers = ','.join("%s:%s" % (s['host'],s['port']) for s in y['zookeepers'])


      zk = KazooClient(hosts=servers)
      zk.start()
      zk.add_listener(state_listener)

      if args.snaps:
         status = look4snaps(zk)
      else:
         status = look4abort(zk)

      zk.remove_listener(state_listener)
      zk.stop()

      status.exit()

   except Exception as e:
      UNKNOWN("Error: %s" % e).exit()

class ConsistentScheduler(object):
    '''
        LibPartitionHelper abstracts out workers and work_items, and their
        mapping to partitions, so an application deals only with the work
        items it owns, without bothering about partition mapping.

        This class also provides synchronization primitives to ensure apps
        clean up before giving up their partitions
    '''
    _MAX_WAIT_4_ALLOCATION = 6 + randint(0, 9)

    def __init__(self, service_name=None, zookeeper='127.0.0.1:2181',
                 delete_hndlr=None, add_hndlr=None, bucketsize=47,
                 item2part_func=None, partitioner=None, logger=None, 
                 cluster_id=''):
        if logger:
            self._logger = logger
        else:
            self._logger = logging.getLogger(__name__)
        self._service_name = service_name or os.path.basename(sys.argv[0])
        self._item2part_func = item2part_func or self._device2partition
        self._zookeeper_srvr = zookeeper
        self._zk = None
        self._bucketsize = bucketsize
        self._delete_hndlr = delete_hndlr
        self._add_hndlr = add_hndlr
        self._partitioner = partitioner or self._partitioner_func
        self._partitions = {}
        self._con_hash = None
        self._last_log = ''
        self._last_log_cnt = 0
        # a list, not a lazy map object: the set is reused in _release()
        self._partition_set = list(map(str, range(self._bucketsize)))

        self._cluster_id = cluster_id
        if self._cluster_id:
            self._zk_path = '/'+self._cluster_id + '/contrail_cs' + '/'+self._service_name
        else:
            self._zk_path = '/'.join(['/contrail_cs', self._service_name])
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')

        while True:
            self._logger.error("Consistent scheduler zk start")
            self._zk = KazooClient(self._zookeeper_srvr,
                handler=SequentialGeventHandler())
            self._zk.add_listener(self._zk_lstnr)
            try:
                self._zk.start()
                while self._conn_state != ConnectionStatus.UP:
                    gevent.sleep(1)
                break
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                self._zk.remove_listener(self._zk_lstnr)
                try:
                    self._zk.stop()
                    self._zk.close()
                except Exception as ex:
                    template = "Exception {0} in Consistent scheduler zk stop/close. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s" % \
                        (messag, traceback.format_exc(), self._service_name))
                finally:
                    self._zk = None
                gevent.sleep(1)
        self._pc = self._zk.SetPartitioner(path=self._zk_path,
                                           set=self._partition_set,
                                           partition_func=self._partitioner)
        self._wait_allocation = 0
        gevent.sleep(0)

    def _sandesh_connection_info_update(self, status, message):
        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name='Zookeeper', status=new_conn_state,
                               message=message,
                               server_addrs=self._zookeeper_srvr.split(','))

        if ((self._conn_state and self._conn_state != ConnectionStatus.DOWN) and
            new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' %(message)
            self._supress_log(msg)
        if (self._conn_state and self._conn_state != new_conn_state and
            new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._supress_log(msg)

        self._conn_state = new_conn_state
    # end _sandesh_connection_info_update

    def _zk_lstnr(self, state):
        self._logger.error("Consistent scheduler listen %s" % str(state))
        if state == KazooState.CONNECTED:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
        elif state == KazooState.LOST:
            self._logger.error("Consistent scheduler connection LOST")
            # Lost the session with ZooKeeper Server
            # Best of option we have is to exit the process and restart all 
            # over again
            self._sandesh_connection_info_update(status='DOWN',
                                      message='Connection to Zookeeper lost')
            os._exit(2)
        elif state == KazooState.SUSPENDED:
            self._logger.error("Consistent scheduler connection SUSPENDED")
            # Update connection info
            self._sandesh_connection_info_update(status='INIT',
                message = 'Connection to zookeeper lost. Retrying')

    def schedule(self, items, lock_timeout=30):
        gevent.sleep(0)
        ret = False
        if self._pc.failed:
            self._logger.error('Lost or unable to acquire partition')
            os._exit(2)
        elif self._pc.release:
            self._supress_log('Releasing...')
            self._release()
        elif self._pc.allocating:
            self._supress_log('Waiting for allocation...')
            self._pc.wait_for_acquire(lock_timeout)
            if self._wait_allocation < self._MAX_WAIT_4_ALLOCATION:
                self._wait_allocation += 1
            else:
                self._logger.error('Giving up after %d tries!' %
                    (self._wait_allocation))
                os._exit(2)
        elif self._pc.acquired:
            self._supress_log('got work: ', list(self._pc))
            ret = True
            self._wait_allocation = 0
            self._populate_work_items(items)
            self._supress_log('work items: ',
                              self._items2name(self.work_items()),
                              'from the list',
                              self._items2name(items))
        return ret

    def members(self):
        return list(self._con_hash.nodes)

    def partitions(self):
        return list(self._pc)

    def work_items(self):
        return sum(self._partitions.values(), [])

    def finish(self):
        self._inform_delete(self._partitions.keys())
        self._pc.finish()
        self._zk.remove_listener(self._zk_lstnr)
        gevent.sleep(1)
        try:
            self._zk.stop()
        except:
            self._logger.error("Stopping kazooclient failed")
        else:
            self._logger.error("Stopping kazooclient successful")
        try:
            self._zk.close()
        except:
            self._logger.error("Closing kazooclient failed")
        else:
            self._logger.error("Closing kazooclient successful")

    def _items2name(self, items):
        return [x.name for x in items]

    def _supress_log(self, *s):
        slog = ' '.join(map(str, s))
        dl = ''
        if slog != self._last_log:  # compare against the last message, not the repeat count
            if self._last_log_cnt:
                dl += ' ' * 4
                dl += '.' * 8
                dl += '[last print repeats %d times]' % self._last_log_cnt
                self._last_log_cnt = 0
            dl += slog
            self._last_log = slog
            self._logger.debug(dl)
        else:
            self._last_log_cnt += 1

    def _consistent_hash(self, members):
        if self._con_hash is None:
            self._con_hash = ConsistentHash(members)
            self._logger.error('members: %s' % (str(self._con_hash.nodes)))
        cur, updtd = set(self._con_hash.nodes), set(members)
        if cur != updtd:
            newm = updtd - cur
            rmvd = cur - updtd
            if newm:
                self._logger.error('new members: %s' % (str(newm)))
                self._con_hash.add_nodes(list(newm))
            if rmvd:
                self._logger.error('members left: %s' % (str(rmvd)))
                self._con_hash.del_nodes(list(rmvd))
        return self._con_hash

    def _consistent_hash_get_node(self, members, partition):
        return self._consistent_hash(members).get_node(partition)

    def _partitioner_func(self, identifier, members, _partitions):
        partitions = [p for p in _partitions \
            if self._consistent_hash_get_node(members, p) == identifier]
        self._logger.error('partitions: %s' % (str(partitions)))
        return partitions

    def _release(self):
        old = set(self._pc)
        new = set(self._partitioner(self._pc._identifier,
                                   list(self._pc._party),
                                   self._partition_set))
        rmvd = old - new
        added = new - old
        if rmvd:
            self._inform_delete(list(rmvd))
        if added:
            self._inform_will_add(list(added))
        self._pc.release_set()

    def _list_items_in(self, partitions):
        return sum([self._partitions[k] for k in partitions if k in \
                    self._partitions], [])

    def _inform_will_add(self, partitions):
        if callable(self._add_hndlr):
            self._add_hndlr(self._list_items_in(partitions))

    def _inform_delete(self, partitions):
        if callable(self._delete_hndlr):
            self._delete_hndlr(self._list_items_in(partitions))

    def _populate_work_items(self, items):
        self._refresh_work_items()
        for i in items:
            part = str(self._item2part_func(i.name))
            if part in list(self._pc):
                if part not in self._partitions:
                    self._partitions[part] = []
                if i.name not in map(lambda x: x.name,
                                     self._partitions[part]):
                    self._partitions[part].append(i)
        self._logger.debug('@populate_work_items(%s): done!' % ' '.join(
                map(lambda v: str(v[0]) + ':' + ','.join(map(
                        lambda x: x.name, v[1])), self._partitions.items())))
        gevent.sleep(0)

    def _device2partition(self, key):
        # hashlib.md5 requires bytes under Python 3
        digest = hashlib.md5(key.encode('utf-8')).digest()
        return struct.unpack('Q', digest[-8:])[0] % self._bucketsize

    def _refresh_work_items(self):
        for k in self._partitions:
            self._partitions[k] = []
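
_device2partition maps a key onto one of bucketsize partitions via the last
8 bytes of its md5 digest. The same mapping as a standalone sketch; the key
is an illustrative assumption.

import hashlib
import struct

def device2partition(key, bucketsize=47):
    # take the last 8 bytes of the md5 digest as an unsigned 64-bit int,
    # then reduce it into [0, bucketsize)
    digest = hashlib.md5(key.encode('utf-8')).digest()
    return struct.unpack('Q', digest[-8:])[0] % bucketsize

print(device2partition('device-42'))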
Example #6
class AnalyticsDiscovery(gevent.Greenlet):
    def _sandesh_connection_info_update(self, status, message):

        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name=self._svc_name,
                               status=new_conn_state,
                               message=message,
                               server_addrs=self._zk_server.split(','))

        if (self._conn_state and self._conn_state != ConnectionStatus.DOWN
                and new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' % (message)
            self._logger.error(msg)
        if (self._conn_state and self._conn_state != new_conn_state
                and new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._logger.error(msg)

        self._conn_state = new_conn_state

    # end _sandesh_connection_info_update

    def _zk_listen(self, state):
        self._logger.error("Analytics Discovery listen %s" % str(state))
        if state == KazooState.CONNECTED:
            self._sandesh_connection_info_update(status='UP', message='')
            self._logger.error("Analytics Discovery to publish %s" %
                               str(self._pubinfo))
            self._reconnect = True
        elif state == KazooState.LOST:
            self._logger.error("Analytics Discovery connection LOST")
            # Lost the session with ZooKeeper Server
            # Best of option we have is to exit the process and restart all
            # over again
            self._sandesh_connection_info_update(
                status='DOWN', message='Connection to Zookeeper lost')
            os._exit(2)
        elif state == KazooState.SUSPENDED:
            self._logger.error("Analytics Discovery connection SUSPENDED")
            # Update connection info
            self._sandesh_connection_info_update(
                status='INIT',
                message='Connection to zookeeper lost. Retrying')

    def _zk_datawatch(self, watcher, child, data, stat, event="unknown"):
        self._logger.error(\
                "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \
                (watcher, child, data, event))
        if data:
            data_dict = json.loads(data)
            self._wchildren[watcher][child] = OrderedDict(
                sorted(data_dict.items()))
        else:
            if child in self._wchildren[watcher]:
                del self._wchildren[watcher][child]
        if self._watchers[watcher]:
            self._pendingcb.add(watcher)

    def _zk_watcher(self, watcher, children):
        self._logger.error("Analytics Discovery Children %s" % children)
        self._reconnect = True

    def __init__(self,
                 logger,
                 zkservers,
                 svc_name,
                 inst,
                 watchers={},
                 zpostfix="",
                 freq=10):
        gevent.Greenlet.__init__(self)
        self._svc_name = svc_name
        self._inst = inst
        self._zk_server = zkservers
        # initialize logging and other stuff
        if logger is None:
            logging.basicConfig()
            self._logger = logging
        else:
            self._logger = logger
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')
        self._zkservers = zkservers
        self._zk = None
        self._pubinfo = None
        self._publock = Semaphore()
        self._watchers = watchers
        self._wchildren = {}
        self._pendingcb = set()
        self._zpostfix = zpostfix
        self._basepath = "/analytics-discovery-" + self._zpostfix
        self._reconnect = None
        self._freq = freq

    def publish(self, pubinfo):

        # This function can be called concurrently by the main AlarmDiscovery
        # processing loop as well as by clients.
        # It is NOT re-entrant
        self._publock.acquire()

        self._pubinfo = pubinfo
        if self._conn_state == ConnectionStatus.UP:
            try:
                self._logger.error("ensure %s" %
                                   (self._basepath + "/" + self._svc_name))
                self._logger.error("zk state %s (%s)" %
                                   (self._zk.state, self._zk.client_state))
                self._zk.ensure_path(self._basepath + "/" + self._svc_name)
                self._logger.error("check for %s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst))
                if pubinfo is not None:
                    if self._zk.exists("%s/%s/%s" % \
                            (self._basepath, self._svc_name, self._inst)):
                        self._zk.set("%s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst),
                                self._pubinfo)
                    else:
                        self._zk.create("%s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst),
                                self._pubinfo, ephemeral=True)
                else:
                    if self._zk.exists("%s/%s/%s" % \
                            (self._basepath, self._svc_name, self._inst)):
                        self._logger.error("withdrawing published info!")
                        self._zk.delete("%s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst))

            except Exception as ex:
                template = "Exception {0} in AnalyticsDiscovery publish. Args:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("%s : traceback %s for %s info %s" % \
                        (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                self._sandesh_connection_info_update(status='DOWN', message='')
                self._reconnect = True
        else:
            self._logger.error("Analytics Discovery cannot publish while down")
        self._publock.release()

    def _run(self):
        while True:
            self._logger.error("Analytics Discovery zk start")
            self._zk = KazooClient(hosts=self._zkservers)
            self._zk.add_listener(self._zk_listen)
            try:
                self._zk.start()
                while self._conn_state != ConnectionStatus.UP:
                    gevent.sleep(1)
                break
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                self._zk.remove_listener(self._zk_listen)
                try:
                    self._zk.stop()
                    self._zk.close()
                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery zk stop/close. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s" % \
                        (messag, traceback.format_exc(), self._svc_name))
                finally:
                    self._zk = None
                gevent.sleep(1)

        try:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
            self._reconnect = False
            # Done connecting to ZooKeeper

            for wk in self._watchers.keys():
                self._zk.ensure_path(self._basepath + "/" + wk)
                self._wchildren[wk] = {}
                self._zk.ChildrenWatch(self._basepath + "/" + wk,
                                       partial(self._zk_watcher, wk))

            # Trigger the initial publish
            self._reconnect = True

            while True:
                try:
                    if not self._reconnect:
                        pending_list = list(self._pendingcb)
                        self._pendingcb = set()
                        for wk in pending_list:
                            if self._watchers[wk]:
                                self._watchers[wk](\
                                        sorted(self._wchildren[wk].values()))

                    # If a reconnect happens during processing, don't lose it
                    while self._reconnect:
                        self._logger.error("Analytics Discovery %s reconnect" \
                                % self._svc_name)
                        self._reconnect = False
                        self._pendingcb = set()
                        self.publish(self._pubinfo)

                        for wk in self._watchers.keys():
                            self._zk.ensure_path(self._basepath + "/" + wk)
                            children = self._zk.get_children(self._basepath +
                                                             "/" + wk)

                            old_children = set(self._wchildren[wk].keys())
                            new_children = set(children)

                            # Remove contents for the children who are gone
                            # (DO NOT remove the watch)
                            for elem in old_children - new_children:
                                del self._wchildren[wk][elem]

                            # Overwrite existing children, or create new ones
                            for elem in new_children:
                                # Create a watch for new children
                                if elem not in self._wchildren[wk]:
                                    self._zk.DataWatch(self._basepath + "/" + \
                                            wk + "/" + elem,
                                            partial(self._zk_datawatch, wk, elem))

                                data_str, _ = self._zk.get(\
                                        self._basepath + "/" + wk + "/" + elem)
                                data_dict = json.loads(data_str)
                                self._wchildren[wk][elem] = \
                                        OrderedDict(sorted(data_dict.items()))

                                self._logger.error(\
                                    "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \
                                    (wk, elem, self._wchildren[wk][elem], "GET"))
                            if self._watchers[wk]:
                                self._watchers[wk](sorted(
                                    self._wchildren[wk].values()))

                    gevent.sleep(self._freq)
                except gevent.GreenletExit:
                    self._logger.error("Exiting AnalyticsDiscovery for %s" % \
                            self._svc_name)
                    self._zk.remove_listener(self._zk_listen)
                    gevent.sleep(1)
                    try:
                        self._zk.stop()
                    except:
                        self._logger.error("Stopping kazooclient failed")
                    else:
                        self._logger.error("Stopping kazooclient successful")
                    try:
                        self._zk.close()
                    except:
                        self._logger.error("Closing kazooclient failed")
                    else:
                        self._logger.error("Closing kazooclient successful")
                    break

                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery reconnect. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s info %s" % \
                        (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                    self._reconnect = True

        except Exception as ex:
            template = "Exception {0} in AnalyticsDiscovery run. Args:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.error("%s : traceback %s for %s info %s" % \
                    (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
            raise SystemExit
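
A hypothetical way to drive AnalyticsDiscovery: start the greenlet, watch
one service's children, and publish this instance's info. The service and
watcher names, port, and zookeeper address are illustrative assumptions.

import json
import logging

import gevent

def on_collectors(entries):
    # called with the sorted child-data dicts for the watched service
    print("collector set changed:", entries)

ad = AnalyticsDiscovery(logging.getLogger(__name__),
                        zkservers='127.0.0.1:2181',
                        svc_name='my-svc',
                        inst='instance-0',
                        watchers={'collectors': on_collectors},
                        zpostfix='test')
ad.start()  # gevent.Greenlet: runs _run() in the background
ad.publish(json.dumps({'port': 8086}).encode('utf-8'))
gevent.sleep(30)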
Example #7
class ClusterZookeeper(object):
    def __init__(self, zookeeper_hosts, kafka_hosts):
        self.groups_dict = {}
        self.topics_dict = {}
        self.brokers_list = []
        self.consumer = KafkaConsumer(bootstrap_servers=kafka_hosts.split(','))
        self.zk = KazooClient(hosts=zookeeper_hosts)
        self.zk.add_listener(self.keep_start)
        self.zk.start()
        if self.zk.exists('/consumers') is None or self.zk.exists('/brokers') is None:
            raise ValueError(zookeeper_hosts + ' is not a zookeeper for kafka')
        ChildrenWatch(self.zk, '/consumers', self.groups_watch)
        ChildrenWatch(self.zk, '/brokers/topics', self.topics_watch)
        ChildrenWatch(self.zk, '/brokers/ids/', self.brokers_watch)
        t = threading.Thread(target=self.latest, name=kafka_hosts)
        t.daemon = True
        t.start()

    # Ensure the connection stays usable
    def keep_start(self, client_status):
        if client_status != 'CONNECTED':
            try:
                self.zk.start()
            except Exception:
                pass

    # Watch the consumers node
    def groups_watch(self, children):
        for group in [group for group in self.groups_dict.keys() if group not in children]:
            self.groups_dict.pop(group)
        for group in [group for group in children if group not in self.groups_dict.keys()]:
            owners_p = '/consumers/' + group + '/owners'
            if self.zk.exists(owners_p) is None:
                continue
            g_o_t = GroupOwnersTopic()
            self.groups_dict[group] = g_o_t
            ChildrenWatch(self.zk, owners_p, g_o_t.g_topic_watch)

    # Watch the topic nodes
    def topics_watch(self, children):
        for topic in [topic for topic in self.topics_dict.keys() if topic not in children]:
            self.topics_dict.pop(topic)
        for topic in [topic for topic in children if topic not in self.topics_dict.keys()]:
            t_v = TopicValue()
            self.topics_dict[topic] = t_v
            DataWatch(self.zk, '/brokers/topics/' + topic, t_v.topic_watch)
            t_v.topic_partition = [TopicPartition(topic, p) for p in self.consumer.partitions_for_topic(topic)]

    # Watch the broker nodes
    def brokers_watch(self, children):
        self.brokers_list = children

    def close_zk(self):
        try:
            self.zk.remove_listener(self.keep_start)
            self.zk.stop()
            self.zk.close()
        except Exception:
            pass

    def latest(self):
        while True:
            # time.sleep(0.1)
            time.sleep(0.001)
            for k, v in self.topics_dict.items():
                try:
                    partitions = v.topic_partition
                    self.consumer.assign(partitions)
                    self.consumer.seek_to_end(*partitions)
                    log_offset = sum(self.consumer.position(p) for p in partitions)
                    now_timestamp = int(time.mktime(time.localtime()))
                    if 'timestamp' in v.__dict__ and v.timestamp is not None:
                        v.speed = (log_offset - v.off_set) / (now_timestamp - v.timestamp)
                    v.timestamp = now_timestamp
                    v.off_set = log_offset
                except Exception:
                    pass
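
A minimal usage sketch, assuming a reachable ZooKeeper/Kafka pair (the host strings below are placeholders): construct the watcher, give the daemon thread time to sample offsets, then read the computed per-topic rates and shut down.

if __name__ == '__main__':
    cz = ClusterZookeeper('zk1:2181,zk2:2181', 'kafka1:9092,kafka2:9092')
    time.sleep(10)  # let the daemon thread collect a few samples
    for topic, tv in cz.topics_dict.items():
        # speed is messages/second, present once two samples have been taken
        print(topic, getattr(tv, 'speed', None))
    cz.close_zk()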
예제 #8
class ZookeeperWatcher():
    zoo_client = None  # The KazooClient to manage the config
    point_path = None  # Zookeeper path to pointed to file
    pointed_at_expired = None  # is True when the assignment has been set to
                               # None but we cannot remove the config listener
    valid_handler = None  # the function to call when the validity changes
    config_handler = None  # the function to call when the config changes
    error_handler = None  # the function to call when an error occurs in reading
    valid_file = False  # the current state of the ConfigWatcher with ZK
    do_not_restart = False  # used when closing via ^C
    old_data = ''  # The current file contents, to see if a change occurred
    old_pointed = ''  # the current pointed path, to see if change occurred

    INVALID_PATH = "Invalid pointer path"
    INVALID_GET = "Invalid get on file path"
    BAD_CONNECTION = "Connection interrupted with Zookeeper, re-establishing"

    def __init__(self, hosts, filepath, valid_handler=None,
                 config_handler=None, error_handler=None, pointer=False,
                 ensure=False, valid_init=True):
        '''
        Zookeeper file watcher, used to tell a program their zookeeper file has
        changed. Can be used to watch a single file, or both a file and path of
        its contents. Manages all connections, drops, reconnections for you.

        @param hosts: The zookeeper hosts to use
        @param filepath: The full path to the file to watch
        @param valid_handler: The method to call for a 'is valid' state change
        @param config_handler: The method to call when a content change occurs
        @param error_handler: The method to call when an error occurs
        @param pointer: Set to true if the file contents are actually a path to
                        another zookeeper file, where the real config resides
        @param ensure: Set to true for the ZooWatcher to create the watched file
        @param valid_init: Ensure the client can connect to Zookeeper first try

        Ex 1. /stuff/A: "stuff I care about"
        Ex 2. /stuff/A: "/other/stuff", /other/stuff: "contents I care about"
            - in Ex 2 you care about /other/stuff contents
              but are only aware of your assignment /stuff/A

        You can use this class as any combination of event driven or polling.
        Polling:
            In the main loop of your program, check if is_valid() is
            True, otherwise clear your contents as there is some ZK error.
        Event:
            You will be notified via the various handlers when content changes.
        '''
        self.hosts = hosts
        self.my_file = filepath
        self.pointer = pointer
        self.ensure = ensure
        self.valid_handler = valid_handler
        self.config_handler = config_handler
        self.error_handler = error_handler

        if valid_init:
            # this will throw an exception if it can't start right away
            self.zoo_client = KazooClient(hosts=self.hosts)
            self.zoo_client.start()

        self.threaded_start(no_init=True)

    def threaded_start(self, no_init=False):
        '''
        Spawns a worker thread to set up the zookeeper connection
        '''
        thread = Thread(target=self.init_connections, kwargs={
                        'no_init': no_init})
        thread.setDaemon(True)
        thread.start()
        thread.join()

    def init_connections(self, no_init=False):
        '''
        Sets up the initial Kazoo Client and watches
        '''
        success = False
        self.set_valid(False)

        if not no_init:
            if self.zoo_client:
                self.zoo_client.remove_listener(self.state_listener)
                self.old_data = ''
                self.old_pointed = ''

            while not success:
                try:
                    if self.zoo_client is None:
                        self.zoo_client = KazooClient(hosts=self.hosts)
                        self.zoo_client.start()
                    else:
                        # self.zoo_client.stop()
                        self.zoo_client._connection.connection_stopped.set()
                        self.zoo_client.close()
                        self.zoo_client = KazooClient(hosts=self.hosts)
                        self.zoo_client.start()
                except Exception as e:
                    print("ZKWatcher Exception:", e)
                    sleep(1)
                    continue

                self.setup()
                success = self.update_file(self.my_file)
                sleep(5)
        else:
            self.setup()
            self.update_file(self.my_file)
예제 #9
class Server(threading.Thread):
    '''
    Worker server (also a ZooKeeper client)
    '''
    # Lock that serializes printed output. Note: this is a single-machine
    # lock, while the class below implements the distributed one, so there
    # is no contradiction here.
    print_mutex = threading.Lock()
    
    DELAY_TIME = 3
    
    def __init__(self, zk_server_address, lock_base_path, host, serve_mode):
        threading.Thread.__init__(self)
        # root path of the lock nodes
        self.lock_base_path = lock_base_path
        # host IP
        self.host = host
        # serve mode: read ("R") or write ("W")
        self.serve_mode = serve_mode
        # event flag, initially unset
        self.event = threading.Event()

        # create a ZooKeeper client
        self.zkclient = KazooClient(zk_server_address)
        # register a connection-state listener
        self.zkclient.add_listener(self.zk_connect_listener)
        # open the connection to ZooKeeper
        self.zkclient.start()
        
    
    # connection-state listener
    def zk_connect_listener(self, state):
        # acquire the print lock
        Server.print_mutex.acquire()
        if state == KazooState.CONNECTED:
            print(self.host + " is up...")
        elif state == KazooState.LOST:
            print(self.host + " stopped serving...")
        else:
            raise Exception(self.host + " did not start properly...")
        # release the print lock
        Server.print_mutex.release()
      
        
    # thread entry point
    def run(self):
        # create the lock node, e.g. /shared_lock/192.168.0.0-R-0000000001
        self.create_lock_node()
        # acquire the lock
        self.acquire_lock()
        # do the work
        self.work()
        # release the lock
        self.release_lock()
        # prepare to stop
        self.stop()
        
        
    def create_lock_node(self):
        # check the parent node first; create it if it does not exist
        if not self.zkclient.exists(self.lock_base_path):
            self.zkclient.create(self.lock_base_path)
        # build the full path prefix of this server's child node
        node_path = self.lock_base_path + "/" + self.host + "-" + self.serve_mode + "-"
        # create an ephemeral sequential node
        self.node_path = self.zkclient.create(node_path, b"", self.zkclient.default_acl,
                                              ephemeral=True, sequence=True)
    
    
    # handler for the delete event on the predecessor node
    def pre_node_delete_watch(self, data, stat, event):
        if event and event.type == EventType.DELETED:
            # set the event flag so the waiting thread wakes up
            self.event.set()
    
    
    # acquire the lock
    def acquire_lock(self):
        # extract this server's node name
        node_name = self.node_path.split("/")[-1]
        # get the sorted list of children under /shared_lock
        sorted_children = self.get_sorted_children()
        # find this node's index
        node_index = sorted_children.index(node_name)

        # find the last write node before ours
        def get_last_write_node_index():
            # iterate backwards
            for i in reversed(range(node_index)):
                # the serve mode is the second field of the node name
                serve_mode = sorted_children[i].split("-")[1]
                # return as soon as a write request is found
                if serve_mode == "W":
                    return i
            # if all predecessors are read requests, return -1
            return -1

        # if this is a write request,
        if self.serve_mode == "W":
            # check whether our node has the smallest sequence number
            if node_index == 0:
                # return immediately, hold the lock, and start writing
                return
            # otherwise watch the node immediately before ours
            else:
                # build the path of the previous node
                pre_node_path = self.lock_base_path + "/" + sorted_children[node_index - 1]
                # watch for the delete event on the previous node
                self.zkclient.DataWatch(pre_node_path, self.pre_node_delete_watch)
                # wait here for the lock
                self.event.wait()
        # if this is a read request
        else:
            # index of the last write node among all nodes smaller than ours
            last_write_node_index = get_last_write_node_index()
            # check whether either condition holds:
            # 1) there is no node with a smaller sequence number
            # 2) all nodes with smaller sequence numbers are read requests
            # if so,
            if node_index == 0 or last_write_node_index < 0:
                # return immediately, hold the shared lock, and start reading
                return
            # otherwise watch the last write node smaller than ours
            else:
                # build the path of that write node
                pre_node_path = self.lock_base_path + "/" + sorted_children[last_write_node_index]
                # watch for the delete event on that node
                self.zkclient.DataWatch(pre_node_path, self.pre_node_delete_watch)
                # wait here for the lock
                self.event.wait()
    
    
    def work(self):
        # acquire the print lock
        Server.print_mutex.acquire()
        # if this is a write request,
        if self.serve_mode == "W":
            # write some data, then delete the node and close the session
            print(self.host + " is writing data...")
        else:
            # read some data, then delete the node and close the session
            print(self.host + " is reading data...")
        Server.print_mutex.release()
        # pause a few seconds to simulate time spent working
        sleep(self.DELAY_TIME)
    
    
    # release the lock
    def release_lock(self):
        # delete our own node
        self.zkclient.delete(self.node_path)
    
    
    # get the sorted list of children under /shared_lock
    def get_sorted_children(self):
        # fetch the children of the lock root
        children = self.zkclient.get_children(self.lock_base_path)
        # return the sequence number of a lock node
        def get_lock_node_seq(node_name):
            # split the name and convert the last field to an integer
            return int(node_name.split("-")[-1])
        # sort the children by their sequence-number suffix
        children.sort(key=get_lock_node_seq)

        return children
        
    
    # stop serving
    def stop(self):
        # remove the connection-state listener
        self.zkclient.remove_listener(self.zk_connect_listener)
        # stop and close the session
        self.zkclient.stop()
        self.zkclient.close()
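
A minimal driver sketch for this shared read/write lock, assuming a local ZooKeeper at 127.0.0.1:2181 (the address and host strings are placeholders): start a mix of reader and writer servers and let them queue on /shared_lock.

if __name__ == '__main__':
    servers = [
        Server('127.0.0.1:2181', '/shared_lock', '192.168.0.1', 'R'),
        Server('127.0.0.1:2181', '/shared_lock', '192.168.0.2', 'W'),
        Server('127.0.0.1:2181', '/shared_lock', '192.168.0.3', 'R'),
    ]
    for s in servers:
        s.start()  # each thread creates its node, waits its turn, then works
    for s in servers:
        s.join()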
예제 #10
class PartitionClient(object):
    """ Client Class for the Partition Library
    Example usage:
    ---------------------
    import libpartition
    from libpartition.libpartition import PartitionClient

    def own_change_cb(l):
            print("ownership change:" + str(l))

    c = PartitionClient("test", "s1", ["s1", "s2", "s3"], 32, 
            own_change_cb, "zookeeper_s1")

    ## do some real work now
    if (c.own_partition(1)):
        ...... do something with partition #1 .....
        .........
    ...
    c.update_cluster_list(["s1", "s2"])
    ...
    ----------------------
    You should not call any partition library routine from within the 
    callback function

    Args:
        app_name(str): Name of the app for which partition cluster is used
        self_name(str): Name of the local cluster node (can be ip address)
        cluster_list(list): List of all the nodes in the cluster including 
            local node
        max_partition(int): Partition space always goes from 0..max_partition-1
        partition_update_cb: Callback function invoked when partition
            ownership list is updated.
        zk_server(str): <zookeeper server>:<zookeeper server port>
    """
    def __init__(
            self, app_name, self_name, cluster_list, max_partition,
            partition_update_cb, zk_server, logger = None):
       
        # Initialize local variables
        self._zk_server = zk_server
        self._cluster_list = set(cluster_list)
        self._max_partition = max_partition
        self._update_cb = partition_update_cb
        self._curr_part_ownership_list = []
        self._target_part_ownership_list = []
        self._con_hash = ConsistentHash(cluster_list)
        self._name = self_name

        # some sanity check
        if self._name not in cluster_list:
            raise ValueError('cluster list is missing local server name')

        # initialize logging and other stuff
        if logger is None:
            logging.basicConfig()
            self._logger = logging
        else:
            self._logger = logger
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')

        # connect to zookeeper
        while True:
            self._logger.error("Libpartition zk start")
            self._zk = KazooClient(zk_server)
            self._zk.add_listener(self._zk_listen)
            try:
                self._zk.start()
                while self._conn_state != ConnectionStatus.UP:
                    gevent.sleep(1)
                break
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                self._zk.remove_listener(self._zk_listen)
                try:
                    self._zk.stop()
                    self._zk.close()
                except Exception as ex:
                    template = "Exception {0} in Libpartition zk stop/close. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s" % \
                        (messag, traceback.format_exc(), self._name))
                finally:
                    self._zk = None
                gevent.sleep(1)

        # create a lock array to contain locks for each partition
        self._part_locks = []
        for part in range(0, self._max_partition):
            lockpath = "/lockpath/"+ app_name + "/" + str(part)
            l = self._zk.Lock(lockpath, self._name)
            self._part_locks.append(l)

        # initialize partition # to lock acquire greenlet dictionary
        self._part_lock_task_dict = {}
       
        self._logger.error("initial servers:" + str(self._cluster_list))

        # update target partition ownership list
        for part in range(0, self._max_partition):
            if (self._con_hash.get_node(str(part)) == self._name):
                self._target_part_ownership_list.append(part)

        # update current ownership list
        self._acquire_partition_ownership()

    #end __init__

    def _sandesh_connection_info_update(self, status, message):
        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type = ConnectionType.ZOOKEEPER,
                name = 'Zookeeper', status = new_conn_state,
                message = message,
                server_addrs = self._zk_server.split(','))

        if (self._conn_state and self._conn_state != ConnectionStatus.DOWN and
                new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' %(message)
            self._logger.error(msg)
        if (self._conn_state and self._conn_state != new_conn_state and
                new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._logger.error(msg)

        self._conn_state = new_conn_state
    # end _sandesh_connection_info_update

    def _zk_listen(self, state):
        self._logger.error("Libpartition listen %s" % str(state))
        if state == KazooState.CONNECTED:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
        elif state == KazooState.LOST:
            self._logger.error("Libpartition connection LOST")
            # Lost the session with ZooKeeper Server
            # Best of option we have is to exit the process and restart all 
            # over again
            self._sandesh_connection_info_update(status='DOWN',
                                      message='Connection to Zookeeper lost')
            os._exit(2)
        elif state == KazooState.SUSPENDED:
            self._logger.error("Libpartition connection SUSPENDED")
            # Update connection info
            self._sandesh_connection_info_update(status='INIT',
                message = 'Connection to zookeeper lost. Retrying')

    # following routine is the greenlet task function to acquire the lock
    # for a partition
    def _acquire_lock(self, part):
        # lock for the partition
        l = self._part_locks[part]

        # go in an infinite loop waiting to acquire the lock
        try:
            while True:
                ret = l.acquire(blocking=False)
                if ret == True:
                    self._logger.error("Acquired lock for:" + str(part))
                    self._curr_part_ownership_list.append(part)
                    self._update_cb(self._curr_part_ownership_list)
                    return True
                else:
                    gevent.sleep(1)
        except CancelledError:
            self._logger.error("Lock acquire cancelled for:" + str(part))
            return False
        except Exception as ex:
            # TODO: If we have a non-KazooException, the lock object
            #       may get stuck in the "cancelled" state
            self._logger.error("Lock acquire unexpected error!: " + str(ex))
            # This exception should get propagated to the main thread
            raise SystemExit(1)
    #end _acquire_lock

    # get rid of finished spawned tasks from datastructures
    def _cleanup_greenlets(self):
        # iterate over a copy since entries are deleted while looping
        for part in list(self._part_lock_task_dict.keys()):
            if self._part_lock_task_dict[part].ready():
                del self._part_lock_task_dict[part]
    #end _cleanup_greenlets 

    # following routine launches tasks to acquire partition locks
    def _acquire_partition_ownership(self):
        # cleanup any finished greenlets
        self._cleanup_greenlets()

        # this variable will help us decide if we need to call callback
        updated_curr_ownership = False 

        # list of partitions for which locks have to be released
        release_lock_list = []

        self._logger.info("known servers: %s" % self._con_hash.get_all_nodes())

        for part in range(0, self._max_partition):
            if (part in self._target_part_ownership_list):
                if (part in self._curr_part_ownership_list):
                    # do nothing, I already have ownership of this partition
                    self._logger.info("No need to acquire ownership of:" +
                            str(part))
                else:
                    # I need to acquire lock for this partition before I own
                    if (part in self._part_lock_task_dict.keys()):
                        try:
                            self._part_lock_task_dict[part].get(block=False)
                        except:
                            # do nothing there is already a greenlet running to
                            # acquire the lock
                            self._logger.error("Already a greenlet running to" 
                                    " acquire:" + str(part))
                            continue

                        # Greenlet died without getting ownership. Cleanup
                        self._logger.error("Cleanup stale greenlet running to" 
                                " acquire:" + str(part))
                        del self._part_lock_task_dict[part]

                    self._logger.error("Starting greenlet running to" 
                            " acquire:" + str(part))
                    # launch the greenlet to acquire the lock
                    g = Greenlet.spawn(self._acquire_lock, part)
                    self._part_lock_task_dict[part] = g

            else:
                # give up ownership of the partition

                # cancel any lock acquisition which is ongoing 
                if (part in self._part_lock_task_dict.keys()):
                    try:
                        self._part_lock_task_dict[part].get(block=False)
                    except:
                        
                        self._logger.error("canceling lock acquisition going on \
                            for:" + str(part))
                        # Cancelling the lock should result in killing the gevent
                        self._part_locks[part].cancel()
                        self._part_lock_task_dict[part].get(block=True)
                        
                    del self._part_lock_task_dict[part]
                        
                if (part in self._curr_part_ownership_list):
                    release_lock_list.append(part)
                    self._curr_part_ownership_list.remove(part)
                    updated_curr_ownership = True
                    self._logger.error("giving up ownership of:" + str(part))

        if (updated_curr_ownership is True):
            # current partition membership was updated call the callback 
            self._update_cb(self._curr_part_ownership_list)

        if (len(release_lock_list) != 0):
            # release locks which were acquired
            for part in release_lock_list:
                self._logger.error("release the lock which was acquired:" + \
                        str(part))
                try:
                    self._part_locks[part].release()
                    self._logger.error("fully gave up ownership of:" + str(part))
                except:
                    pass
    #end _acquire_partition_ownership

    def update_cluster_list(self, cluster_list):
        """ Updates the cluster node list
        Args:
            cluster_list(list): New list of names of the nodes in 
                the cluster
        Returns:
            None
        """
        # some sanity check
        if self._name not in cluster_list:
            raise ValueError('cluster list is missing local server name')

        new_cluster_list = set(cluster_list)
        new_servers = list(new_cluster_list.difference(
            self._cluster_list))
        deleted_servers = list(set(self._cluster_list).difference(
            new_cluster_list)) 
        self._cluster_list = set(cluster_list)

        # update the hash structure
        if new_servers:
            self._logger.error("new servers:" + str(new_servers))
            self._con_hash.add_nodes(new_servers)
        if deleted_servers:
            self._logger.error("deleted servers:" + str(deleted_servers))
            self._con_hash.del_nodes(deleted_servers)

        # update target partition ownership list
        self._target_part_ownership_list = []
        for part in range(0, self._max_partition):
            if (self._con_hash.get_node(str(part)) == self._name):
                if not (part in self._target_part_ownership_list):
                    self._target_part_ownership_list.append(part)

        # update current ownership list
        self._acquire_partition_ownership()

    #end update_cluster_list

    def own_partition(self, part_no):
        """ Returns ownership information of a partition
        Args:
            part_no(int) : Partition no 
        Returns:
            True if partition is owned by the local node
            False if partition is not owned by the local node
        """
        return part_no in self._curr_part_ownership_list 
    #end own_partition

    def close(self):
        """ Closes any connections and frees up any data structures
        Args:
        Returns:
            None
        """
        # clean up greenlets
        for part in self._part_lock_task_dict.keys():
            try:
                self._logger.error("libpartition greenlet cleanup %s" % str(part))
                self._part_lock_task_dict[part].kill()
            except:
                pass

        self._zk.remove_listener(self._zk_listen)
        gevent.sleep(1)
        self._logger.error("Stopping libpartition")
        # close zookeeper
        try:
            self._zk.stop()
        except:
            self._logger.error("Stopping libpartition failed")
        else:
            self._logger.error("Stopping libpartition successful")

        self._logger.error("Closing libpartition")
        try:
            self._zk.close()
        except:
            self._logger.error("Closing libpartition failed")
        else:
            self._logger.error("Closing libpartition successful")
예제 #11
File: txkazoo.py  Project: lvh/txkazoo
class TxKazooClient(object):
    """
    Twisted wrapper for `kazoo.client.KazooClient`

    Runs the blocking methods of `kazoo.client.KazooClient` in a separate
    thread and returns a Deferred that fires with the method's result, or
    errbacks with any exception raised during method execution
    """

    kz_get_attributes = ['handler', 'retry', 'state', 'client_state', 'client_id', 'connected']

    def __init__(self, **kwargs):
        """
        Initialize `TxKazooClient`

        Takes the same arguments as KazooClient, plus an extra keyword
        argument `threads` that suggests the thread pool size to use
        """
        threads = kwargs.pop('threads', 10)
        reactor.suggestThreadPoolSize(threads)

        log = kwargs.pop('txlog', None)
        if log:
            kwargs['logger'] = TxLogger(log)

        self.client = KazooClient(**kwargs)
        self._internal_listeners = dict()

    def __getattr__(self, name):
        """
        Delegates method execution to a thread pool, by returning a function
        that calls the named `KazooClient` method in a separate thread.

        :return: `Deferred` that fires with the result of the executed method
                 if `name` is a method name. Otherwise, the `KazooClient`
                 attribute value is returned directly
        """
        if name in self.kz_get_attributes:
            # Assuming all attributes access are not blocking
            return getattr(self.client, name)
        return lambda *args, **kwargs: deferToThread(getattr(self.client, name), *args, **kwargs)

    def add_listener(self, listener):
        # This call does not block and is probably not thread safe. It is best if it
        # is called from twisted reactor thread only

        def _listener(state):
            # Called from kazoo thread. Replaying the original listener in reactor
            # thread
            reactor.callFromThread(listener, state)

        self._internal_listeners[listener] = _listener
        return self.client.add_listener(_listener)

    def remove_listener(self, listener):
        _listener = self._internal_listeners.pop(listener)
        self.client.remove_listener(_listener)

    def _watch_func(self, func, path, watch=None, **kwargs):
        if not watch:
            return deferToThread(func, path, **kwargs)

        def _watch(event):
            # Called from kazoo thread. Replaying in reactor
            reactor.callFromThread(watch, event)

        return deferToThread(func, path, watch=_watch, **kwargs)

    def exists(self, path, watch=None):
        return self._watch_func(self.client.exists, path, watch)

    def exists_async(self, path, watch=None):
        return self._watch_func(self.client.exists_async, path, watch)

    def get(self, path, watch=None):
        return self._watch_func(self.client.get, path, watch)

    def get_async(self, path, watch=None):
        return self._watch_func(self.client.get_async, path, watch)

    def get_children(self, path, watch=None, include_data=False):
        return self._watch_func(self.client.get_children, path, watch, include_data=include_data)

    def get_children_async(self, path, watch=None, include_data=False):
        return self._watch_func(self.client.get_children_async, path, watch, include_data=include_data)

    def Lock(self, path, identifier=None):
        """
        Return Twisted wrapper for `Lock` object corresponding to this client.
        """
        return Lock(self.client.Lock(path, identifier))

    def SetPartitioner(self, path, set, **kwargs):
        """
        Return Twisted wrapper for `SetPartitioner` object corresponding to this client.
        """
        return SetPartitioner(self.client, path, set, **kwargs)
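
A short usage sketch under Twisted, assuming a ZooKeeper at 127.0.0.1:2181 (host and node path are placeholders); every delegated call returns a Deferred because the underlying kazoo method runs in the reactor's thread pool.

from twisted.internet import reactor

client = TxKazooClient(hosts='127.0.0.1:2181', threads=10)

def on_data(result):
    data, stat = result
    print("node data:", data)
    reactor.stop()

def on_error(failure):
    print(failure)
    reactor.stop()

d = client.start()  # KazooClient.start, run in the thread pool
d.addCallback(lambda _: client.get('/some/node'))
d.addCallback(on_data)
d.addErrback(on_error)
reactor.run()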
예제 #12
class ZooHandler(object):
    def __init__(self):
        self.zookeeper_client = None
        if not settings.ZOOKEEPER_SETTING['enable']:
            logging.info('zookeeper disabled')
            return
        self.zoo_hosts = settings.ZOOKEEPER_SETTING['server_address']
        logging.info('start zookeeper client, zoo hosts: %s' % self.zoo_hosts)
        self.base_dir = settings.ZOOKEEPER_SETTING['base_dir']
        self.zookeeper_client = KazooClient(hosts=self.zoo_hosts)
        self.zookeeper_client.add_listener(self.state_listener)
        self.zookeeper_client.start_async()

    def state_listener(self, state):
        # session was lost
        if state == KazooState.LOST:
            logging.error('zookeeper lost!')
        # disconnected from Zookeeper
        elif state == KazooState.SUSPENDED:
            logging.error('zookeeper disconnected!')
        # connected/reconnected to Zookeeper
        elif state == KazooState.CONNECTED:
            logging.warning('zookeeper reconnected! trying to register')
            self.register_node()
        else:
            logging.critical('unexpected zookeeper state!!!')

    def register_node(self):
        if not self.zookeeper_client or not self.zookeeper_client.connected:
            logging.error('zoo not connected, register cancel')
            return
        path = ZooHandler.get_register_path()
        try:
            # try to register the node
            def try_to_create_node(result):
                logging.info('zoo try_to_create_node called')
                try:
                    # None means the node does not exist
                    if result.value is None:
                        self.zookeeper_client.create_async(path,
                                                           makepath=True,
                                                           ephemeral=True)
                    elif result.exception:
                        logging.fatal(
                            'critical error when try to check node when reconnected, %s',
                            result.exception)
                    else:
                        logging.warning(
                            'node already exists on reconnect and register attempt'
                        )
                except BaseException as e:
                    logging.exception('critical error, %s', e)

            # watch for node changes
            def node_watcher(watch_event):
                logging.info('zoo node_watcher called')
                try:
                    if EventType.DELETED == watch_event.type:
                        logging.warning('zoo node deleted, try recreate')
                        self.zookeeper_client.create_async(path,
                                                           makepath=True,
                                                           ephemeral=True)
                    if EventType.CHANGED == watch_event.type:
                        logging.warning('zoo node changed, do nothing')
                    if EventType.CHILD == watch_event.type:
                        logging.warning('zoo node child event, do nothing')
                    if EventType.CREATED == watch_event.type:
                        logging.info('zoo node successfully created')
                    if EventType.NONE == watch_event.type:
                        logging.error('zoo node status returned None')
                finally:
                    self.zookeeper_client.exists_async(path,
                                                       watch=node_watcher)

            future = self.zookeeper_client.exists_async(path,
                                                        watch=node_watcher)
            future.rawlink(try_to_create_node)
        except ZookeeperError as e:
            logging.exception('zookeeper exception when registering node: %s' % e)
        except BaseException:
            logging.exception('critical error!')

    # remove the node and stop the client
    def stop(self):
        logging.info('stopping zookeeper client')
        if self.zookeeper_client:
            self.zookeeper_client.remove_listener(self.state_listener)
            self.zookeeper_client.stop()
            logging.info('zookeeper stopped')

    @staticmethod
    def get_register_path():
        base_dir = settings.ZOOKEEPER_SETTING['base_dir']
        if base_dir[-1] == '/':
            base_dir = base_dir[0:-1]
        register_name = "%s/%s:%s:%s" % (
            base_dir, settings.ZOOKEEPER_SETTING['local_name'],
            settings.ZOOKEEPER_SETTING['local_ip'],
            settings.HTTP_SERVER_SETTING['port'])
        return register_name
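
A minimal wiring sketch, assuming a `settings` module shaped like the one referenced above (every value below is a placeholder, and run_http_server is hypothetical):

# hypothetical contents of settings.py
ZOOKEEPER_SETTING = {
    'enable': True,
    'server_address': '127.0.0.1:2181',
    'base_dir': '/services/http',
    'local_name': 'web-1',
    'local_ip': '10.0.0.5',
}
HTTP_SERVER_SETTING = {'port': 8080}

# in the application
handler = ZooHandler()  # connects asynchronously; registers once CONNECTED
try:
    run_http_server()   # hypothetical main loop
finally:
    handler.stop()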
예제 #13
class AnalyticsDiscovery(gevent.Greenlet):

    def _sandesh_connection_info_update(self, status, message):

        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type = ConnectionType.ZOOKEEPER,
                name = self._svc_name, status = new_conn_state,
                message = message,
                server_addrs = self._zk_server.split(','))

        if (self._conn_state and self._conn_state != ConnectionStatus.DOWN and
                new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' %(message)
            self._logger.error(msg)
        if (self._conn_state and self._conn_state != new_conn_state and
                new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._logger.error(msg)

        self._conn_state = new_conn_state
    # end _sandesh_connection_info_update

    def _zk_listen(self, state):
        self._logger.error("Analytics Discovery listen %s" % str(state))
        if state == KazooState.CONNECTED:
            self._sandesh_connection_info_update(status='UP', message='')
            self._logger.error("Analytics Discovery to publish %s" % str(self._pubinfo))
            self._reconnect = True
        elif state == KazooState.LOST:
            self._logger.error("Analytics Discovery connection LOST")
            # Lost the session with ZooKeeper Server
            # Best of option we have is to exit the process and restart all 
            # over again
            self._sandesh_connection_info_update(status='DOWN',
                                      message='Connection to Zookeeper lost')
            os._exit(2)
        elif state == KazooState.SUSPENDED:
            self._logger.error("Analytics Discovery connection SUSPENDED")
            # Update connection info
            self._sandesh_connection_info_update(status='INIT',
                message = 'Connection to zookeeper lost. Retrying')

    def _zk_datawatch(self, watcher, child, data, stat, event="unknown"):
        self._logger.error(\
                "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \
                (watcher, child, data, event))
        if data:
            data_dict = json.loads(data)
            self._wchildren[watcher][child] = OrderedDict(sorted(data_dict.items()))
        else:
            if child in self._wchildren[watcher]:
                del self._wchildren[watcher][child]
        if self._watchers[watcher]:
            self._pendingcb.add(watcher)

    def _zk_watcher(self, watcher, children):
        self._logger.error("Analytics Discovery Children %s" % children)
        self._reconnect = True

    def __init__(self, logger, zkservers, svc_name, inst,
                watchers={}, zpostfix="", freq=10):
        gevent.Greenlet.__init__(self)
        self._svc_name = svc_name
        self._inst = inst
        self._zk_server = zkservers
        # initialize logging and other stuff
        if logger is None:
            logging.basicConfig()
            self._logger = logging
        else:
            self._logger = logger
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')
        self._zkservers = zkservers
        self._zk = None
        self._pubinfo = None
        self._publock = Semaphore()
        self._watchers = watchers
        self._wchildren = {}
        self._pendingcb = set()
        self._zpostfix = zpostfix
        self._basepath = "/analytics-discovery-" + self._zpostfix
        self._reconnect = None
        self._freq = freq

    def publish(self, pubinfo):

        # This function can be called concurrently by the main AlarmDiscovery
        # processing loop as well as by clients.
        # It is NOT re-entrant
        self._publock.acquire()

        self._pubinfo = pubinfo
        if self._conn_state == ConnectionStatus.UP:
            try:
                self._logger.error("ensure %s" % (self._basepath + "/" + self._svc_name))
                self._logger.error("zk state %s (%s)" % (self._zk.state, self._zk.client_state))
                self._zk.ensure_path(self._basepath + "/" + self._svc_name)
                self._logger.error("check for %s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst))
                if pubinfo is not None:
                    if self._zk.exists("%s/%s/%s" % \
                            (self._basepath, self._svc_name, self._inst)):
                        self._zk.set("%s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst),
                                self._pubinfo)
                    else:
                        self._zk.create("%s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst),
                                self._pubinfo, ephemeral=True)
                else:
                    if self._zk.exists("%s/%s/%s" % \
                            (self._basepath, self._svc_name, self._inst)):
                        self._logger.error("withdrawing published info!")
                        self._zk.delete("%s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst))

            except Exception as ex:
                template = "Exception {0} in AnalyticsDiscovery publish. Args:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("%s : traceback %s for %s info %s" % \
                        (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                self._sandesh_connection_info_update(status='DOWN', message='')
                self._reconnect = True
        else:
            self._logger.error("Analytics Discovery cannot publish while down")
        self._publock.release()

    def _run(self):
        while True:
            self._logger.error("Analytics Discovery zk start")
            self._zk = KazooClient(hosts=self._zkservers)
            self._zk.add_listener(self._zk_listen)
            try:
                self._zk.start()
                while self._conn_state != ConnectionStatus.UP:
                    gevent.sleep(1)
                break
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                self._zk.remove_listener(self._zk_listen)
                try:
                    self._zk.stop()
                    self._zk.close()
                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery zk stop/close. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s" % \
                        (messag, traceback.format_exc(), self._svc_name))
                finally:
                    self._zk = None
                gevent.sleep(1)

        try:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
            self._reconnect = False
            # Done connecting to ZooKeeper

            for wk in self._watchers.keys():
                self._zk.ensure_path(self._basepath + "/" + wk)
                self._wchildren[wk] = {}
                self._zk.ChildrenWatch(self._basepath + "/" + wk,
                        partial(self._zk_watcher, wk))

            # Trigger the initial publish
            self._reconnect = True

            while True:
                try:
                    if not self._reconnect:
                        pending_list = list(self._pendingcb)
                        self._pendingcb = set()
                        for wk in pending_list:
                            if self._watchers[wk]:
                                self._watchers[wk](\
                                        sorted(self._wchildren[wk].values()))

                    # If a reconnect happens during processing, don't lose it
                    while self._reconnect:
                        self._logger.error("Analytics Discovery %s reconnect" \
                                % self._svc_name)
                        self._reconnect = False
                        self._pendingcb = set()
                        self.publish(self._pubinfo)

                        for wk in self._watchers.keys():
                            self._zk.ensure_path(self._basepath + "/" + wk)
                            children = self._zk.get_children(self._basepath + "/" + wk)

                            old_children = set(self._wchildren[wk].keys())
                            new_children = set(children)

                            # Remove contents for the children who are gone
                            # (DO NOT remove the watch)
                            for elem in old_children - new_children:
                                del self._wchildren[wk][elem]

                            # Overwrite existing children, or create new ones
                            for elem in new_children:
                                # Create a watch for new children
                                if elem not in self._wchildren[wk]:
                                    self._zk.DataWatch(self._basepath + "/" + \
                                            wk + "/" + elem,
                                            partial(self._zk_datawatch, wk, elem))

                                data_str, _ = self._zk.get(\
                                        self._basepath + "/" + wk + "/" + elem)
                                data_dict = json.loads(data_str)
                                self._wchildren[wk][elem] = \
                                        OrderedDict(sorted(data_dict.items()))

                                self._logger.error(\
                                    "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \
                                    (wk, elem, self._wchildren[wk][elem], "GET"))
                            if self._watchers[wk]:
                                self._watchers[wk](sorted(self._wchildren[wk].values()))

                    gevent.sleep(self._freq)
                except gevent.GreenletExit:
                    self._logger.error("Exiting AnalyticsDiscovery for %s" % \
                            self._svc_name)
                    self._zk.remove_listener(self._zk_listen)
                    gevent.sleep(1)
                    try:
                        self._zk.stop()
                    except:
                        self._logger.error("Stopping kazooclient failed")
                    else:
                        self._logger.error("Stopping kazooclient successful")
                    try:
                        self._zk.close()
                    except:
                        self._logger.error("Closing kazooclient failed")
                    else:
                        self._logger.error("Closing kazooclient successful")
                    break

                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery reconnect. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s info %s" % \
                        (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                    self._reconnect = True

        except Exception as ex:
            template = "Exception {0} in AnalyticsDiscovery run. Args:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.error("%s : traceback %s for %s info %s" % \
                    (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
            raise SystemExit
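
A brief usage sketch, assuming a ZooKeeper at 127.0.0.1:2181 (service and instance names are placeholders); AnalyticsDiscovery is a greenlet, so it must be started before publishing.

import json
import logging

def on_collectors(entries):
    print("collector entries: " + str(entries))

ad = AnalyticsDiscovery(logging, '127.0.0.1:2181', 'collector', 'inst-0',
                        watchers={'collector': on_collectors},
                        zpostfix='demo', freq=10)
ad.start()  # spawns _run, which connects and installs the watches
# kazoo stores node values as bytes, so encode the published JSON
ad.publish(json.dumps({'ip': '10.0.0.5'}).encode('utf-8'))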
예제 #14
class ZookeeperWatcher(object):
    zoo_client = None  # The KazooClient to manage the config
    point_path = None  # Zookeeper path to pointed to file
    pointed_at_expired = None  # is True when the assignment has been set to
                               # None but we cannot remove the config listener
    valid_handler = None  # the function to call when the validity changes
    config_handler = None  # the function to call when the config changes
    error_handler = None  # the function to call when an error occurs in reading
    valid_file = False  # the current state of the ConfigWatcher with ZK
    do_not_restart = False  # used when closing via ^C
    old_data = ''  # The current file contents, to see if a change occurred
    old_pointed = ''  # the current pointed path, to see if change occurred

    INVALID_PATH = "Invalid pointer path"
    INVALID_GET = "Invalid get on file path"
    BAD_CONNECTION = "Connection interrupted with Zookeeper, re-establishing"

    def __init__(self, hosts, filepath, valid_handler=None,
                 config_handler=None, error_handler=None, pointer=False,
                 ensure=False, valid_init=True):
        '''
        Zookeeper file watcher, used to tell a program their zookeeper file has
        changed. Can be used to watch a single file, or both a file and path of
        its contents. Manages all connections, drops, reconnections for you.

        @param hosts: The zookeeper hosts to use
        @param filepath: The full path to the file to watch
        @param valid_handler: The method to call for a 'is valid' state change
        @param config_handler: The method to call when a content change occurs
        @param error_handler: The method to call when an error occurs
        @param pointer: Set to true if the file contents are actually a path to
                        another zookeeper file, where the real config resides
        @param ensure: Set to true for the ZooWatcher to create the watched file
        @param valid_init: Ensure the client can connect to Zookeeper first try

        Ex 1. /stuff/A: "stuff I care about"
        Ex 2. /stuff/A: "/other/stuff", /other/stuff: "contents I care about"
            - in Ex 2 you care about /other/stuff contents
              but are only aware of your assignment /stuff/A

        You can use this class as any combination of event driven or polling.
        Polling:
            In the main loop of your program, check if is_valid() is
            True, otherwise clear your contents as there is some ZK error.
        Event:
            You will be notified via the various handlers when content changes.
        '''
        self.hosts = hosts
        self.my_file = filepath
        self.pointer = pointer
        self.ensure = ensure
        self.valid_handler = valid_handler
        self.config_handler = config_handler
        self.error_handler = error_handler

        if valid_init:
            # this will throw an exception if it can't start right away
            self.zoo_client = KazooClient(hosts=self.hosts)
            self.zoo_client.start()

        self.threaded_start(no_init=True)

    def threaded_start(self, no_init=False):
        '''
        Spawns a worker thread to set up the zookeeper connection
        '''
        thread = Thread(target=self.init_connections, kwargs={
                        'no_init': no_init})
        thread.setDaemon(True)
        thread.start()
        thread.join()

    def init_connections(self, no_init=False):
        '''
        Sets up the initial Kazoo Client and watches
        '''
        success = False
        self.set_valid(False)

        if not no_init:
            if self.zoo_client:
                self.zoo_client.remove_listener(self.state_listener)
                self.old_data = ''
                self.old_pointed = ''

            while not success:
                try:
                    if self.zoo_client is None:
                        self.zoo_client = KazooClient(hosts=self.hosts)
                        self.zoo_client.start()
                    else:
                        # self.zoo_client.stop()
                        self.zoo_client._connection.connection_stopped.set()
                        self.zoo_client.close()
                        self.zoo_client = KazooClient(hosts=self.hosts)
                        self.zoo_client.start()
                except Exception as e:
                    log.error("ZKWatcher Exception: " + e.message)
                    sleep(1)
                    continue

                self.setup()
                success = self.update_file(self.my_file)
                sleep(5)
        else:
            self.setup()
            self.update_file(self.my_file)

    def setup(self):
        '''
        Ensures the path to the watched file exists and we have a state
        listener
        '''
        self.zoo_client.add_listener(self.state_listener)

        if self.ensure:
            self.zoo_client.ensure_path(self.my_file)

    def state_listener(self, state):
        '''
        Restarts the session if we get anything besides CONNECTED
        '''
        if state == KazooState.SUSPENDED:
            self.set_valid(False)
            self.call_error(self.BAD_CONNECTION)
        elif state == KazooState.LOST and not self.do_not_restart:
            self.threaded_start()
        elif state == KazooState.CONNECTED:
            # This is going to throw a SUSPENDED kazoo error
            # which will cause the sessions to be wiped and re established.
            # Used b/c of massive connection pool issues
            self.zoo_client.stop()

    def is_valid(self):
        '''
        @return: True if the currently watch file is valid
        '''
        return self.valid_file

    def ping(self):
        '''
        Simple command to test if the zookeeper session is able to connect
        at this very moment
        '''
        try:
            # dummy ping to ensure we are still connected
            self.zoo_client.server_version()
            return True
        except KazooException:
            return False

    def close(self, kill_restart=True):
        '''
        Use when you would like to close everything down
        @param kill_restart: Prevent kazoo from restarting the connection
        '''
        self.do_not_restart = kill_restart
        self.zoo_client.stop()
        self.zoo_client.close()

    def get_file_contents(self, pointer=False):
        '''
        Gets any file contents you care about. Defaults to the main file
        @param pointer: Set True to get the contents of the pointer file
        itself, not the pointed-at file
        @return: A string of the contents
        '''
        if self.pointer:
            if pointer:
                return self.old_pointed
            else:
                return self.old_data
        else:
            return self.old_data

    def watch_file(self, event):
        '''
        Fired when changes are made to the file
        '''
        if not self.update_file(self.my_file):
            self.threaded_start()

    def update_file(self, path):
        '''
        Updates the file watcher and calls the appropriate method for results
        @return: False if we need to keep trying the connection
        '''
        try:
            # grab the file
            result, stat = self.zoo_client.get(path, watch=self.watch_file)
        except ZookeeperError:
            self.set_valid(False)
            self.call_error(self.INVALID_GET)
            return False

        if self.pointer:
            if result is not None and len(result) > 0:
                self.pointed_at_expired = False
                # file is a pointer, go update and watch other file
                self.point_path = result
                if self.compare_pointer(result):
                    self.update_pointed()
            else:
                self.pointed_at_expired = True
                self.old_pointed = ''
                self.old_data = ''
                self.set_valid(False)
                self.call_error(self.INVALID_PATH)
        else:
            # file is not a pointer, return contents
            if self.compare_data(result):
                self.call_config(result)
            self.set_valid(True)

        return True

    def watch_pointed(self, event):
        '''
        Fired when changes are made to the pointed file
        '''
        self.update_pointed()

    def update_pointed(self):
        '''
        Grabs the latest file contents based on the pointer uri
        '''
        # only grab file if our pointer is still good (not None)
        if not self.pointed_at_expired:
            try:
                conf_string, stat2 = self.zoo_client.get(self.point_path,
                                                    watch=self.watch_pointed)
            except ZookeeperError:
                self.old_data = ''
                self.set_valid(False)
                self.pointed_at_expired = True
                self.call_error(self.INVALID_PATH)
                return

            if self.compare_data(conf_string):
                self.call_config(conf_string)
            self.set_valid(True)

    def set_valid(self, boolean):
        '''
        Sets the state and calls the change if needed
        @param boolean: The new state (True or False)
        '''
        old_state = self.is_valid()
        self.valid_file = boolean

        if old_state != self.valid_file:
            self.call_valid(self.valid_file)

    def call_valid(self, state):
        '''
        Calls the valid change function passed in
        @param state: The new validity state
        '''
        if self.valid_handler is not None:
            self.valid_handler(self.is_valid())

    def call_config(self, new_config):
        '''
        Calls the config function passed in
        @param new_config: The new config
        '''
        if self.config_handler is not None:
            self.config_handler(new_config)

    def call_error(self, message):
        '''
        Calls the error function passed in
        @param message: The message to throw
        '''
        if self.error_handler is not None:
            self.error_handler(message)

    def compare_data(self, data):
        '''
        Compares the string data
        @return: True if the data is different
        '''
        if self.old_data != data:
            self.old_data = data
            return True
        return False

    def compare_pointer(self, data):
        '''
        Compares the string data
        @return: True if the data is different
        '''
        if self.old_pointed != data:
            self.old_pointed = data
            return True
        return False
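
A short usage sketch grounded in the class docstring, assuming a ZooKeeper at 127.0.0.1:2181 and a config node /stuff/A (both placeholders); it combines the event handlers with the polling check.

def on_config(conf_string):
    print("new config: " + str(conf_string))

def on_error(message):
    print("watcher error: " + str(message))

watcher = ZookeeperWatcher('127.0.0.1:2181', '/stuff/A',
                           config_handler=on_config,
                           error_handler=on_error,
                           ensure=True)
try:
    while True:
        if not watcher.is_valid():
            print("config invalid; clearing cached contents")
        sleep(5)
except KeyboardInterrupt:
    watcher.close()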
예제 #15
class ZooKeeperDeadmanPlugin:
    def __init__(self, name, app):
        self.name = name
        self.app = app
        self._monitors = {}
        self.tick_time = app.tick_time  # seconds; should match the ZooKeeper server tick time (normally given in milliseconds)
        self.logger = logging
        self._takeovers = {}

    def _path(self, type, name=_missing):
        if name is _missing:
            name = self.app.my_id
        if name:
            return self._path_prefix + type + "/" + self._group_name + "-" + name
        else:
            return self._path_prefix + type

    def _lock_path(self, name):
        return self._path("lock", name)

    @subscribe
    def initialize(self):
        self._loop = asyncio.get_event_loop()
        self._zk = KazooClient(hosts=self.app.config["zookeeper"]["connection_string"])
        self._zk.add_listener(self._session_state_threadsafe)
        self._zk.start()
        self._path_prefix = self.app.config["zookeeper"]["path"].strip()
        if not self._path_prefix.endswith("/"):
            self._path_prefix += "/"
        self._group_name = self.app.config["zookeeper"]["group"].strip()
        if "/" in self._group_name or "-" in self._group_name:
            raise ValueError("cannot have - or / in the group name")

    def _session_state_threadsafe(self, state):
        self._loop.call_soon_threadsafe(self._loop.create_task, self._session_state(state))

    async def _session_state(self, state):
        unhealthy_key = "{}.no_zookeeper_connection".format(self.name)
        # scheduled onto the event loop from kazoo's listener thread
        if state == KazooState.SUSPENDED:
            # we wait for the tick time before taking action to see if
            # our session gets re-established
            await asyncio.sleep(self.tick_time)
            # we have to assume we are irretrievably lost, minimum session
            # timeout in zookeeper is 2 * tick time so stop postgresql now
            # and let a failover happen
            if self._zk.state != KazooState.CONNECTED:
                self.app.unhealthy(
                    unhealthy_key, "No connection to zookeeper: {}".format(self._zk.state), can_be_replica=True
                )
        elif state == KazooState.LOST:
            self.app.restart(10)
            raise AssertionError("We should never get here")
        else:
            self.app.healthy(unhealthy_key)

    def _get_static(self, key):
        path = self._path("static", key)
        try:
            data, stat = self._zk.get(path)
        except kazoo.exceptions.NoNodeError:
            return None
        return data

    def _set_static(self, key, data, overwrite=False):
        path = self._path("static", key)
        try:
            self._zk.create(path, data, makepath=True)
        except kazoo.exceptions.NodeExistsError:
            if overwrite:
                self._zk.set(path, data)
                return True
            return False
        return True

    @subscribe
    def dcs_set_database_identifier(self, database_id):
        database_id = database_id.encode("ascii")
        return self._set_static("database_identifier", database_id)

    @subscribe
    def dcs_get_database_identifier(self):
        data = self._get_static("database_identifier")
        if data is not None:
            data = data.decode("ascii")
        return data

    @subscribe
    def dcs_set_timeline(self, timeline):
        assert isinstance(timeline, int)
        existing = self.dcs_get_timeline()
        if existing > timeline:
            raise ValueError("Timelines can only increase.")
        timeline = str(timeline).encode("ascii")
        self._set_static("timeline", timeline, overwrite=True)

    @subscribe
    def dcs_get_timeline(self):
        data = self._get_static("timeline")
        if data is None:
            data = b"0"
        return int(data.decode("ascii"))

    def _dict_watcher(self, what, callback):
        def hook(state, key, from_val, to_val):
            callback(_get_clusters(state).get(self._group_name, {}))

        path = self._path(what, name=None)
        prefix = self._group_name
        try:
            watch = DictWatch(self._zk, path, hook, prefix=prefix)
        except kazoo.exceptions.NoNodeError:
            self._zk.create(path, makepath=True)
            return self._dict_watcher(what, callback)
        return watch

    @subscribe
    def dcs_watch(self, state=None, conn_info=None):
        path = self._lock_path("master")
        self._monitors["master_lock_watch"] = self._zk.DataWatch(path, self._master_lock_changes)
        if state:
            self._state_watcher = self._dict_watcher("state", state)
        if conn_info:
            self._state_watcher = self._dict_watcher("conn", conn_info)

    def _master_lock_changes(self, data, stat, event):
        if data is not None:
            data = data.decode("utf-8")
        self._loop.call_soon_threadsafe(self.app.master_lock_changed, data)

    @subscribe
    def dcs_get_lock_owner(self, name):
        path = self._lock_path(name)
        try:
            existing_data, stat = self._zk.get(path)
        except kazoo.exceptions.NoNodeError:
            return None
        return existing_data.decode("utf-8")

    @subscribe
    def dcs_lock(self, name):
        data = self.app.my_id.encode("utf-8")
        path = self._lock_path(name)
        try:
            self._zk.create(path, data, ephemeral=True, makepath=True)
            return True
        except kazoo.exceptions.NodeExistsError:
            pass
        # lock exists, do we have it, can we break it?
        try:
            existing_data, stat = self._zk.get(path)
        except kazoo.exceptions.NoNodeError:
            # lock broke while we were looking at it
            # try to get it again
            return self.dcs_lock(name)
        if stat.owner_session_id == self._zk.client_id[0]:
            # we already own the lock
            return True
        elif data == existing_data:
            # it is our lock; perhaps I am restarting, or there are 2 of me running!
            self._log_takeover(path)
            try:
                self._zk.delete(path, version=stat.version)
            except (kazoo.exceptions.NoNodeError, kazoo.exceptions.BadVersionError):
                # lock broke while we were looking at it
                pass
        # try to get the lock again
            return self.dcs_lock(name)
        return False

    @subscribe
    def dcs_unlock(self, name):
        owner = self._get_lock_owner(name)
        if owner == self.app.my_id:
            self._zk.delete(self._lock_path(name))

    def _get_lock_owner(self, name):
        try:
            owner, stat = self._zk.get(self._lock_path(name))
        except kazoo.exceptions.NoNodeError:
            return None
        return owner.decode("utf-8")

    def _log_takeover(self, path):
        if self._takeovers.get(path, False):
            # hmm, I have taken over before, this is NOT good
            # maybe 2 of me are running
            self.logger.error(
                "Taking over again: {}\n"
                "This should not happen, check that you do not "
                "have 2 nodes with the same id running".format(path)
            )
        else:
            # first time I am taking over, probably normal operation after a restart
            self.logger.info("Taking over {}".format(path))
        self._takeovers[path] = True

    def _set_info(self, type, data):
        path = self._path(type)
        data = json.dumps(data)
        data = data.encode("ascii")
        try:
            stat = self._zk.set(path, data)
        except kazoo.exceptions.NoNodeError:
            stat = None
        if stat is not None and stat.owner_session_id != self._zk.client_id[0]:
            self._log_takeover(path)
            self._zk.delete(path)
            stat = None
        if stat is None:
            self._zk.create(self._path(type), data, ephemeral=True, makepath=True)

    @subscribe
    def dcs_set_conn_info(self, data):
        return self._set_info("conn", data)

    @subscribe
    def dcs_set_state(self, data):
        return self._set_info("state", data)

    def _get_all_info(self, type):
        dirpath = self._path_prefix + type
        try:
            children = self._zk.get_children(dirpath)
        except kazoo.exceptions.NoNodeError:
            return iter([])
        for name in children:
            if not name.startswith(self._group_name + "-"):
                continue
            data, stat = self._zk.get(dirpath + "/" + name)
            state = json.loads(data.decode("ascii"))
            yield name[len(self._group_name + "-"):], state

    @subscribe
    def dcs_get_all_conn_info(self):
        return self._get_all_info("conn")

    @subscribe
    def dcs_get_all_state(self):
        return self._get_all_info("state")

    def _delete_info(self, type):
        try:
            self._zk.delete(self._path(type))
        except kazoo.exceptions.NoNodeError:
            pass

    @subscribe
    def dcs_delete_conn_info(self):
        return self._delete_info("conn")

    @subscribe
    def dcs_disconnect(self):
        # for testing only
        self._zk.remove_listener(self._session_state_threadsafe)
        self._zk.stop()
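dcs_lock above leans on two ZooKeeper guarantees: an ephemeral node is deleted when the session that created it dies, and ZnodeStat.owner_session_id records which session that was. A standalone sketch of the same ownership test with plain kazoo (host, path, and id are assumptions):

from kazoo.client import KazooClient
import kazoo.exceptions

zk = KazooClient(hosts='localhost:2181')  # assumed host
zk.start()

def try_lock(path, my_id):
    try:
        # ephemeral: ZooKeeper removes the node when our session dies
        zk.create(path, my_id.encode('utf-8'), ephemeral=True, makepath=True)
        return True
    except kazoo.exceptions.NodeExistsError:
        pass
    try:
        data, stat = zk.get(path)
    except kazoo.exceptions.NoNodeError:
        return try_lock(path, my_id)  # lock vanished while we looked
    # client_id is (session_id, password); the lock is ours if our own
    # session created the ephemeral node
    return stat.owner_session_id == zk.client_id[0]

print(try_lock('/demo/lock/master', 'node-1'))  # hypothetical path and id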
Example #16
class ConsistentScheduler(object):
    '''
        LibPartitionHelper abstracts out workers and work items and their
        mapping to partitions, so the application only deals with the work
        items it owns, without worrying about partition mapping.

        This class also provides synchronization primitives to ensure apps
        clean up before giving up their partitions
    '''
    _MAX_WAIT_4_ALLOCATION = 6 + randint(0, 9)

    def __init__(self,
                 service_name=None,
                 zookeeper='127.0.0.1:2181',
                 delete_hndlr=None,
                 add_hndlr=None,
                 bucketsize=47,
                 item2part_func=None,
                 partitioner=None,
                 logger=None,
                 cluster_id=''):
        if logger:
            self._logger = logger
        else:
            self._logger = logging.getLogger(__name__)
        self._service_name = service_name or os.path.basename(sys.argv[0])
        self._item2part_func = item2part_func or self._device2partition
        self._zookeeper_srvr = zookeeper
        self._zk = None
        self._bucketsize = bucketsize
        self._delete_hndlr = delete_hndlr
        self._add_hndlr = add_hndlr
        self._partitioner = partitioner or self._partitioner_func
        self._partitions = {}
        self._con_hash = None
        self._last_log = ''
        self._last_log_cnt = 0
        # list() so the partition set can be iterated more than once (Python 3)
        self._partition_set = list(map(str, range(self._bucketsize)))

        self._cluster_id = cluster_id
        if self._cluster_id:
            self._zk_path = '/' + self._cluster_id + '/contrail_cs' + '/' + self._service_name
        else:
            self._zk_path = '/'.join(['/contrail_cs', self._service_name])
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')

        while True:
            self._logger.error("Consistent scheduler zk start")
            self._zk = KazooClient(self._zookeeper_srvr,
                                   handler=SequentialGeventHandler())
            self._zk.add_listener(self._zk_lstnr)
            try:
                self._zk.start()
                while self._conn_state != ConnectionStatus.UP:
                    gevent.sleep(1)
                break
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                self._zk.remove_listener(self._zk_lstnr)
                try:
                    self._zk.stop()
                    self._zk.close()
                except Exception as ex:
                    template = "Exception {0} in Consistent scheduler zk stop/close. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s" % \
                        (messag, traceback.format_exc(), self._service_name))
                finally:
                    self._zk = None
                gevent.sleep(1)
        self._pc = self._zk.SetPartitioner(path=self._zk_path,
                                           set=self._partition_set,
                                           partition_func=self._partitioner)
        self._wait_allocation = 0
        gevent.sleep(0)

    def _sandesh_connection_info_update(self, status, message):
        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER,
                               name='Zookeeper',
                               status=new_conn_state,
                               message=message,
                               server_addrs=self._zookeeper_srvr.split(','))

        if ((self._conn_state and self._conn_state != ConnectionStatus.DOWN)
                and new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' % (message)
            self._supress_log(msg)
        if (self._conn_state and self._conn_state != new_conn_state
                and new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._supress_log(msg)

        self._conn_state = new_conn_state

    # end _sandesh_connection_info_update

    def _zk_lstnr(self, state):
        self._logger.error("Consistent scheduler listen %s" % str(state))
        if state == KazooState.CONNECTED:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
        elif state == KazooState.LOST:
            self._logger.error("Consistent scheduler connection LOST")
            # Lost the session with ZooKeeper Server
            # Best of option we have is to exit the process and restart all
            # over again
            self._sandesh_connection_info_update(
                status='DOWN', message='Connection to Zookeeper lost')
            os._exit(2)
        elif state == KazooState.SUSPENDED:
            self._logger.error("Consistent scheduler connection SUSPENDED")
            # Update connection info
            self._sandesh_connection_info_update(
                status='INIT',
                message='Connection to zookeeper lost. Retrying')

    def schedule(self, items, lock_timeout=30):
        gevent.sleep(0)
        ret = False
        if self._pc.failed:
            self._logger.error('Lost or unable to acquire partition')
            os._exit(2)
        elif self._pc.release:
            self._supress_log('Releasing...')
            self._release()
        elif self._pc.allocating:
            self._supress_log('Waiting for allocation...')
            self._pc.wait_for_acquire(lock_timeout)
            if self._wait_allocation < self._MAX_WAIT_4_ALLOCATION:
                self._wait_allocation += 1
            else:
                self._logger.error('Giving up after %d tries!' %
                                   (self._wait_allocation))
                os._exit(2)
        elif self._pc.acquired:
            self._supress_log('got work: ', list(self._pc))
            ret = True
            self._wait_allocation = 0
            self._populate_work_items(items)
            self._supress_log('work items: ',
                              self._items2name(self.work_items()),
                              'from the list', self._items2name(items))
        return ret

    def members(self):
        return list(self._con_hash.nodes)

    def partitions(self):
        return list(self._pc)

    def work_items(self):
        return sum(self._partitions.values(), [])

    def finish(self):
        self._inform_delete(self._partitions.keys())
        self._pc.finish()
        self._zk.remove_listener(self._zk_lstnr)
        gevent.sleep(1)
        try:
            self._zk.stop()
        except Exception:
            self._logger.error("Stopping kazooclient failed")
        else:
            self._logger.error("Stopping kazooclient successful")
        try:
            self._zk.close()
        except Exception:
            self._logger.error("Closing kazooclient failed")
        else:
            self._logger.error("Closing kazooclient successful")

    def _items2name(self, items):
        # a list, not a lazy map object, so it renders in log messages
        return [x.name for x in items]

    def _supress_log(self, *s):
        slog = ' '.join(map(str, s))
        dl = ''
        if slog != self._last_log:
            if self._last_log_cnt:
                dl += ' ' * 4
                dl += '.' * 8
                dl += '[last print repeats %d times]' % self._last_log_cnt
                self._last_log_cnt = 0
            dl += slog
            self._last_log = slog
            self._logger.debug(dl)
        else:
            self._last_log_cnt += 1

    def _consistent_hash(self, members):
        if self._con_hash is None:
            self._con_hash = ConsistentHash(members)
            self._logger.error('members: %s' % (str(self._con_hash.nodes)))
        cur, updtd = set(self._con_hash.nodes), set(members)
        if cur != updtd:
            newm = updtd - cur
            rmvd = cur - updtd
            if newm:
                self._logger.error('new members: %s' % (str(newm)))
                self._con_hash.add_nodes(list(newm))
            if rmvd:
                self._logger.error('members left: %s' % (str(rmvd)))
                self._con_hash.del_nodes(list(rmvd))
        return self._con_hash

    def _consistent_hash_get_node(self, members, partition):
        return self._consistent_hash(members).get_node(partition)

    def _partitioner_func(self, identifier, members, _partitions):
        partitions = [p for p in _partitions \
            if self._consistent_hash_get_node(members, p) == identifier]
        self._logger.error('partitions: %s' % (str(partitions)))
        return partitions

    def _release(self):
        old = set(self._pc)
        new = set(
            self._partitioner(self._pc._identifier, list(self._pc._party),
                              self._partition_set))
        rmvd = old - new
        added = new - old
        if rmvd:
            self._inform_delete(list(rmvd))
        if added:
            self._inform_will_add(list(added))
        self._pc.release_set()

    def _list_items_in(self, partitions):
        return sum([self._partitions[k] for k in partitions if k in \
                    self._partitions], [])

    def _inform_will_add(self, partitions):
        if callable(self._add_hndlr):
            self._add_hndlr(self._list_items_in(partitions))

    def _inform_delete(self, partitions):
        if callable(self._delete_hndlr):
            self._delete_hndlr(self._list_items_in(partitions))

    def _populate_work_items(self, items):
        self._refresh_work_items()
        for i in items:
            part = str(self._item2part_func(i.name))
            if part in list(self._pc):
                if part not in self._partitions:
                    self._partitions[part] = []
                if i.name not in map(lambda x: x.name, self._partitions[part]):
                    self._partitions[part].append(i)
        self._logger.debug('@populate_work_items(%s): done!' % ' '.join(
            map(
                lambda v: str(v[0]) + ':' + ','.join(
                    map(lambda x: x.name, v[1])), self._partitions.items())))
        gevent.sleep(0)

    def _device2partition(self, key):
        if isinstance(key, str):
            key = key.encode('utf-8')  # hashlib requires bytes on Python 3
        return struct.unpack(
            'Q',
            hashlib.md5(key).digest()[-8:])[0] % self._bucketsize

    def _refresh_work_items(self):
        for k in self._partitions:
            self._partitions[k] = []
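Putting the scheduler to work is mostly a polling loop around schedule(), as the class docstring suggests. A minimal sketch (WorkItem and the handler bodies are hypothetical; items only need a .name attribute):

import gevent

class WorkItem(object):
    def __init__(self, name):
        self.name = name

def on_delete(items):
    print('giving up:', [i.name for i in items])

def on_add(items):
    print('about to own:', [i.name for i in items])

sched = ConsistentScheduler(service_name='demo',
                            zookeeper='127.0.0.1:2181',
                            delete_hndlr=on_delete,
                            add_hndlr=on_add)
items = [WorkItem('device-%02d' % i) for i in range(20)]
while True:
    if sched.schedule(items):
        for item in sched.work_items():
            pass  # process only the items this node owns
    gevent.sleep(5)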
Example #17
File: zookeeper.py  Project: ingted/zgres
class ZooKeeperDeadmanPlugin:

    def __init__(self, name, app):
        self.name = name
        self.app = app
        self._monitors = {}
        self.tick_time = app.tick_time # seconds: this should match the zookeeper server tick time (normally specified in milliseconds)

    def _path(self, type, name=_missing):
        if name is _missing:
            name = self.app.my_id
        if name:
            return self._path_prefix + type + '/' + self._group_name + '-' + name
        else:
            return self._path_prefix + type

    def _lock_path(self, name):
        return self._path('lock', name)

    @subscribe
    def initialize(self):
        self._loop = asyncio.get_event_loop()
        self._zk = KazooClient(hosts=self.app.config['zookeeper']['connection_string'])
        self._zk.add_listener(self._session_state_threadsafe)
        self._zk.start()
        self._path_prefix = self.app.config['zookeeper']['path'].strip()
        if not self._path_prefix.endswith('/'):
            self._path_prefix += '/'
        self._group_name = self.app.config['zookeeper']['group'].strip()
        if '/' in self._group_name or '-' in self._group_name:
            raise ValueError('cannot have - or / in the group name')

    def _session_state_threadsafe(self, state):
        self._loop.call_soon_threadsafe(self._loop.create_task, self._session_state(state))

    async def _session_state(self, state):
        unhealthy_key = '{}.no_zookeeper_connection'.format(self.name)
        # scheduled onto the event loop from kazoo's listener thread
        if state == KazooState.SUSPENDED:
            # we wait for the tick time before taking action to see if
            # our session gets re-established
            await asyncio.sleep(self.tick_time)
            # we have to assume we are irretrievably lost, minimum session
            # timeout in zookeeper is 2 * tick time so stop postgresql now
            # and let a failover happen
            if self._zk.state != KazooState.CONNECTED:
                self.app.unhealthy(unhealthy_key, 'No connection to zookeeper: {}'.format(self._zk.state), can_be_replica=True)
        elif state == KazooState.LOST:
            self.app.restart(10)
            raise AssertionError('We should never get here')
        else:
            self.app.healthy(unhealthy_key)

    def _get_static(self, key):
        path = self._path('static', key)
        try:
            data, stat = self._zk.get(path)
        except kazoo.exceptions.NoNodeError:
            return None
        return data
    
    def _set_static(self, key, data, overwrite=False):
        path = self._path('static', key)
        try:
            self._zk.create(path, data, makepath=True)
        except kazoo.exceptions.NodeExistsError:
            if overwrite:
                self._zk.set(path, data)
                return True
            return False
        return True

    @subscribe
    def dcs_set_database_identifier(self, database_id):
        database_id = database_id.encode('ascii')
        return self._set_static('database_identifier', database_id)

    @subscribe
    def dcs_get_database_identifier(self):
        data = self._get_static('database_identifier')
        if data is not None:
            data = data.decode('ascii')
        return data

    @subscribe
    def dcs_set_timeline(self, timeline):
        assert isinstance(timeline, int)
        existing = self.dcs_get_timeline()
        if existing > timeline:
            raise ValueError('Timelines can only increase.')
        timeline = str(timeline).encode('ascii')
        self._set_static('timeline', timeline, overwrite=True)

    @subscribe
    def dcs_get_timeline(self):
        data = self._get_static('timeline')
        if data is None:
            data = b'0'
        return int(data.decode('ascii'))

    def _dict_watcher(self, what, callback):
        def hook(state, key, from_val, to_val):
            callback(_get_clusters(state).get(self._group_name, {}))
        path = self._path(what, name=None)
        prefix = self._group_name
        try:
            watch = DictWatch(self._zk, path, hook, prefix=prefix)
        except kazoo.exceptions.NoNodeError:
            self._zk.create(path, makepath=True)
            return self._dict_watcher(what, callback)
        return watch

    @subscribe
    def dcs_watch(self, state=None, conn_info=None):
        path = self._lock_path('master')
        self._monitors['master_lock_watch'] = self._zk.DataWatch(path, self._master_lock_changes)
        if state:
            self._state_watcher = self._dict_watcher('state', state)
        if conn_info:
            self._state_watcher = self._dict_watcher('conn', conn_info)

    def _master_lock_changes(self, data, stat, event):
        if data is not None:
            data = data.decode('utf-8')
        self._loop.call_soon_threadsafe(self.app.master_lock_changed, data)

    @subscribe
    def dcs_lock(self, name):
        path = self._lock_path(name)
        try:
            self._zk.create(path, self.app.my_id.encode('utf-8'), ephemeral=True, makepath=True)
            return True
        except kazoo.exceptions.NodeExistsError:
            pass
        try:
            owner, stat = self._zk.get(self._lock_path(name))
        except kazoo.exceptions.NoNodeError:
            return False
        if stat.owner_session_id == self._zk.client_id[0]:
            return True
        return False

    @subscribe
    def dcs_unlock(self, name):
        owner = self._get_lock_owner(name)
        if owner == self.app.my_id:
            self._zk.delete(self._lock_path(name))

    def _get_lock_owner(self, name):
        try:
            owner, stat = self._zk.get(self._lock_path(name))
        except kazoo.exceptions.NoNodeError:
            return None
        return owner.decode('utf-8')

    def _set_info(self, type, data):
        data = json.dumps(data)
        data = data.encode('ascii')
        try:
            self._zk.set(self._path(type), data)
        except kazoo.exceptions.NoNodeError:
            self._zk.create(self._path(type), data, ephemeral=True, makepath=True)

    @subscribe
    def dcs_set_conn_info(self, data):
        return self._set_info('conn', data)

    @subscribe
    def dcs_set_state(self, data):
        return self._set_info('state', data)

    def _get_all_info(self, type):
        dirpath = self._path_prefix + type
        try:
            # include_data=True only returns the parent's stat in kazoo,
            # so fetch each child's data individually instead
            children = self._zk.get_children(dirpath)
        except kazoo.exceptions.NoNodeError:
            return iter([])
        for name in children:
            if not name.startswith(self._group_name + '-'):
                continue
            data, stat = self._zk.get(dirpath + '/' + name)
            state = json.loads(data.decode('ascii'))
            yield name[len(self._group_name + '-'):], state
    
    @subscribe
    def dcs_get_all_conn(self):
        return self._get_all_info('conn')

    @subscribe
    def dcs_get_all_state(self):
        return self._get_all_info('state')

    def _delete_info(self, type):
        try:
            self._zk.delete(self._path(type))
        except kazoo.exceptions.NoNodeError:
            pass
    
    @subscribe
    def dcs_delete_conn_info(self):
        return self._delete_info('conn')

    @subscribe
    def dcs_delete_state(self):
        return self._delete_info('state')

    @subscribe
    def dcs_disconnect(self):
        # for testing only
        self._zk.remove_listener(self._session_state_threadsafe)
        self._zk.stop()
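Both variants of the plugin register the master-lock watch through kazoo's DataWatch recipe, which re-arms itself after every event. A minimal standalone sketch (host and path are assumptions) using the same (data, stat, event) callback signature as _master_lock_changes:

import time
from kazoo.client import KazooClient

zk = KazooClient(hosts='localhost:2181')  # assumed host
zk.start()

@zk.DataWatch('/example/lock/demo-master')  # hypothetical path
def on_master_change(data, stat, event):
    # data is None when the lock node does not exist (yet)
    owner = data.decode('utf-8') if data is not None else None
    print('master lock holder:', owner)

time.sleep(60)  # keep the process alive to observe changes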