def _get_json_type(request, cluster_id, type):
    data = []
    error_brokers = 0
    try:
        cluster = get_cluster_or_404(id=cluster_id)
        zk = KazooClient(hosts=cluster['zk_host_ports'])
        zk.start()
        if type == "broker":
            brokers, error_brokers = _get_brokers(zk, cluster_id)
            for broker in brokers:
                data.append(broker['host'])
        if type == "topic":
            topics, error_zk_topics = _get_topics(cluster)
            for topic in topics:
                data.append(topic['id'])
        if type == "metric":
            data = _get_sections_ini()
    except KazooException:
        error_zk_brokers = 1
    zk.stop()
    return JsonResponse(data, safe=False)
def main():
    parser = argparse.ArgumentParser(description=DESCRIPTION)
    parser.add_argument("hosts", metavar="<zookeeper-endpoint>", type=str, nargs="+",
                        help="Zookeeper node endpoints to connect to")
    parser.add_argument("--timeout", dest="timeout", action="store", type=int, default=30,
                        help="Zookeeper connection timeout")
    option = parser.parse_args()

    logging.debug("Using %s as a Zookeeper connection string" % option.hosts)

    client = KazooClient(hosts=",".join(option.hosts))

    try:
        client.start(timeout=option.timeout)
    except TimeoutError as e:
        logging.error("Timed out while connecting to Zookeeper")
        return 1

    status = bootstrap(client, str(uuid.uuid4()))

    # If the client is not stopped, it will hang forever maintaining the connection.
    client.stop()

    return status
def main():
    if len(sys.argv) < 5:
        print(help_msg, "\n")
        print(sys.argv[0], 'zookeeper_server broker_uri_1 broker_uri_2 broker_uri_3')
        print('Example:', sys.argv[0],
              'localhost:2181 socket://localhost:10001/broker1 '
              'socket://localhost:10002/broker2 socket://localhost:10003/broker3')
        exit()

    zk_server = sys.argv[1]
    broker_uris = sys.argv[2:5]
    shift_cmd = shift_cmd_template.format(*broker_uris)
    print('Deploying', shift_cmd)

    zk_client = KazooClient(zk_server, timeout=10 * 60)
    print('Connecting to Zookeeper at', zk_server)
    zk_client.start()

    for uri in broker_uris:
        broker_status[uri] = False
        bid = get_id(uri)

        # make sure broker is free
        data, stats = zk_client.get(ZK_BROKER_OPS_STATUS_STR.format(bid))
        op_status = OpStatus(data.decode('utf-8').upper())
        if op_status not in [OpStatus.Null, OpStatus.Finished]:
            raise RuntimeError('Cannot start {}, {} is in {} state'.format(shift_cmd, bid, op_status.name))

        # update broker's ops status
        zk_client.set(ZK_BROKER_OPS_STATUS_STR.format(bid), OpStatus.Null.value.encode('utf-8'))

        # write the cmd to the broker's ops
        zk_client.set(ZK_BROKER_OPS_STR.format(bid), shift_cmd.encode('utf-8'))

        # set watches for this broker's op status
        DataWatch(zk_client, ZK_BROKER_OPS_STATUS_STR.format(bid), func=get_broker_op_data_watcher(uri))

    print('Waiting for brokers ...')
    while not all_done():
        time.sleep(1)
def create_from_zookeeper(cls, zkconnect):
    log.info("Connecting to zookeeper {0}".format(zkconnect))
    try:
        zk = KazooClient(zkconnect)
        zk.start()
    except Exception as e:
        raise ZookeeperException("Cannot connect to Zookeeper: {0}".format(e))

    # Get broker list
    cluster = cls()
    add_brokers_from_zk(cluster, zk)

    # Get current partition state
    log.info("Getting partition list from Zookeeper")
    for topic in zk.get_children("/brokers/topics"):
        zdata, zstat = zk.get("/brokers/topics/{0}".format(topic))
        add_topic_with_replicas(cluster, topic, json.loads(zdata))

    if cluster.num_topics() == 0:
        raise ZookeeperException("The cluster specified does not have any topics")

    log.info("Closing connection to zookeeper")
    zk.stop()
    zk.close()

    return cluster
def readAMHostPort(self):
    amHost = ""
    amSecuredPort = ""
    zk = None
    try:
        zk = KazooClient(hosts=self.zk_quorum, read_only=True)
        zk.start()
        data, stat = zk.get(self.zk_reg_path)
        logger.debug("Registry Data: %s" % (data.decode("utf-8")))
        sliderRegistry = json.loads(data)
        amUrl = sliderRegistry["payload"]["internalView"]["endpoints"]["org.apache.slider.agents"]["address"]
        amHost = amUrl.split("/")[2].split(":")[0]
        amSecuredPort = amUrl.split(":")[2].split("/")[0]
        # the port needs to be utf-8 encoded
        amSecuredPort = amSecuredPort.encode('utf8', 'ignore')
    except Exception:
        # log and let empty strings be returned
        logger.error("Could not connect to zk registry at %s in quorum %s" %
                     (self.zk_reg_path, self.zk_quorum))
        pass
    finally:
        if zk is not None:
            zk.stop()
            zk.close()
    logger.info("AM Host = %s, AM Secured Port = %s" % (amHost, amSecuredPort))
    return amHost, amSecuredPort
def get_alive_master_ip():
    zk_conn_str = get_os_env('ZOOKEEPER_CONN_STR')
    master_stack_name = get_os_env('MASTER_STACK_NAME')
    master_ip = ""
    global region
    if zk_conn_str != "":
        from kazoo.client import KazooClient
        zk = KazooClient(hosts=zk_conn_str)
        zk.start()
        try:
            master_ip = zk.get("/spark/leader_election/current_master")[0].decode('utf-8')
            zk.stop()
        except:
            master_ip = ""
            zk.stop()
        return master_ip
    elif master_stack_name != "" and region is not None:
        try:
            elb = boto3.client('elb', region_name=region)
            ec2 = boto3.client('ec2', region_name=region)
            master_ips = get_instance_ips(elb, ec2, master_stack_name)
            if len(master_ips) != 1:
                return ""  # shouldn't happen without zookeeper
            elif len(master_ips) == 1:
                return master_ips[0]
            else:
                return ""
        except:
            return ""
    else:
        return ""
def _get_zk_conn(hosts):
    global ZK_CONNECTION
    if ZK_CONNECTION is None:
        ZK_CONNECTION = KazooClient(hosts=hosts)
        ZK_CONNECTION.start()
    return ZK_CONNECTION
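The module-level cache above never stops the client it creates. A minimal companion teardown sketch, assuming the same ZK_CONNECTION global; the helper name _close_zk_conn is made up for illustration:

def _close_zk_conn():
    # Stop and forget the cached client so a later call can reconnect cleanly.
    global ZK_CONNECTION
    if ZK_CONNECTION is not None:
        ZK_CONNECTION.stop()
        ZK_CONNECTION.close()
        ZK_CONNECTION = None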
def start_zoo(cport):
    '''
    Client uses this function to start an instance of zookeeper
    Arguments:
        cport : An unused TCP port for zookeeper to use as the client port
    '''
    basefile = "zookeeper-3.4.5"
    tarfile = os.path.dirname(os.path.abspath(__file__)) + "/" + basefile + ".tar.gz"
    cassbase = "/tmp/zoo." + str(cport) + "/"
    confdir = cassbase + basefile + "/conf/"
    output, _ = call_command_("mkdir " + cassbase)

    logging.info('Installing zookeeper in ' + cassbase + " conf " + confdir)
    os.system("cat " + tarfile + " | tar -xpzf - -C " + cassbase)
    output, _ = call_command_("cp " + confdir + "zoo_sample.cfg " + confdir + "zoo.cfg")

    logging.info('zookeeper Client Port %d' % cport)
    replace_string_(confdir + "zoo.cfg",
                    [("dataDir=/tmp/zookeeper", "dataDir=" + cassbase)])
    replace_string_(confdir + "zoo.cfg",
                    [("clientPort=2181", "clientPort=" + str(cport))])
    output, _ = call_command_(cassbase + basefile + "/bin/zkServer.sh start")

    zk = KazooClient(hosts='127.0.0.1:' + str(cport))
    zk.start()
    zk.stop()
def _open(self):
    conninfo = self.connection.client
    self.vhost = os.path.join('/', conninfo.virtual_host[0:-1])
    hosts = []
    if conninfo.alt:
        for host_port in conninfo.alt:
            if host_port.startswith('zookeeper://'):
                host_port = host_port[len('zookeeper://'):]
            if not host_port:
                continue
            try:
                host, port = host_port.split(':', 1)
                host_port = (host, int(port))
            except ValueError:
                if host_port == conninfo.hostname:
                    host_port = (host_port, conninfo.port or DEFAULT_PORT)
                else:
                    host_port = (host_port, DEFAULT_PORT)
            hosts.append(host_port)
    host_port = (conninfo.hostname, conninfo.port or DEFAULT_PORT)
    if host_port not in hosts:
        hosts.insert(0, host_port)
    conn_str = ','.join(['%s:%s' % (h, p) for h, p in hosts])
    conn = KazooClient(conn_str)
    conn.start()
    return conn
def kafka_save(key, content, kafka, pre=''):
    zookeeper = KazooClient()
    zookeeper.start()
    cluster = Cluster(zookeeper)
    topic = cluster.topics['topicname']
    topic.publish('msg')
def setup(self):
    zk = KazooClient(hosts=self.addr)
    zk.start()
    self.zk = zk

    cfg = self.app.cfg
    log = cfg.logger_class(cfg)
    self.log = log
def resolve_master(
        cluster_url,
        master_callback=lambda: True,
        termination_callback=lambda: True,
        zk_client=None):
    """
    Resolve the MySQL cluster master's endpoint from the given URL for this cluster.

    :param cluster_url: The ZooKeeper URL for this cluster.
    :param master_callback: A callback method with one argument: the ServiceInstance for the
                            elected master.
    :param termination_callback: A callback method with no argument. Invoked when the cluster
                                 terminates.
    :param zk_client: Use a custom ZK client instead of Kazoo if specified.
    """
    try:
        _, zk_servers, cluster_path = zookeeper.parse(cluster_url)
    except Exception as e:
        raise ValueError("Invalid cluster_url: %s" % e.message)

    if not zk_client:
        zk_client = KazooClient(zk_servers)
        zk_client.start()

    listener = ClusterListener(
        zk_client,
        cluster_path,
        None,
        master_callback=master_callback,
        termination_callback=termination_callback)
    listener.start()
class ZKTestBase(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        utdocker.pull_image(zk_tag)

    def setUp(self):
        utdocker.create_network()
        utdocker.start_container(
            zk_name,
            zk_tag,
            env={
                "ZOO_MY_ID": 1,
                "ZOO_SERVERS": "server.1=0.0.0.0:2888:3888",
            },
            port_bindings={2181: 21811}
        )

        self.zk = KazooClient('127.0.0.1:21811')
        self.zk.start()

        self.zkauthed, _ = zkutil.kazoo_client_ext(
            {'hosts': '127.0.0.1:21811',
             'auth': ('digest', 'xp', '123'),
             'acl': (('xp', '123', 'cdrwa'), ('foo', 'bar', 'rw'))})

        dd('start zk-test in docker')

    def tearDown(self):
        self.zk.stop()
        self.zkauthed.stop()
        utdocker.remove_container(zk_name)
class ActorAddressBook(object):

    def __init__(self, zk_hosts, timeout=60.0):
        self.retry = KazooRetry(max_tries=10)
        self.zk = KazooClient(hosts=zk_hosts, timeout=timeout)
        self.zk.start()

    def lookup(self, path):
        return self.retry(self._lookup, path)

    def _lookup(self, path):
        actor_url, stat = self.zk.get(path)
        return RemoteActor(actor_url.decode('utf-8'))

    def register(self, path, actor_url):
        return self.retry(self._register, path, actor_url)

    def _register(self, path, actor_url):
        self.zk.ensure_path(path)
        self.zk.set(path, actor_url.encode('utf-8'))

    def delete(self, path):
        self.zk.delete(path, recursive=True)

    def __del__(self):
        self.zk.stop()
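A brief usage sketch for the address book above; the hosts string, znode path, and actor URL are placeholders, not values from the original code:

# Assumes a ZooKeeper ensemble reachable at the placeholder address below.
book = ActorAddressBook('zk1:2181,zk2:2181')
book.register('/actors/worker-1', 'tcp://10.0.0.5:9999')
worker = book.lookup('/actors/worker-1')   # returns a RemoteActor wrapping the stored URL
book.delete('/actors/worker-1')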
def init_codis_info(self):
    if self.has_init():
        return

    # start zookeeper client
    zk_client = KazooClient(hosts=self.zk_addr)
    zk_client.start()

    # get codis server information
    zk_servers_dir = "/zk/codis/db_%s/servers" % self.product_name
    for zk_server in zk_client.get_children(zk_servers_dir):
        zk_server_path = '/'.join((zk_servers_dir, zk_server))
        for server in zk_client.get_children(zk_server_path):
            server_path = '/'.join((zk_server_path, server))
            data, stat = zk_client.get(server_path)
            server_info = json.loads(data)
            group_id = server_info.get('group_id')
            server_type = server_info.get('type')
            server_addr = server_info.get('addr')
            self.add_codis_server(group_id, server_type, server_addr)

    # get codis proxy information
    zk_proxy_dir = "/zk/codis/db_%s/proxy" % self.product_name
    for zk_proxy in zk_client.get_children(zk_proxy_dir):
        zk_proxy_path = '/'.join((zk_proxy_dir, zk_proxy))
        data, stat = zk_client.get(zk_proxy_path)
        proxy_info = json.loads(data)
        self.add_proxy(proxy_info['id'], proxy_info['addr'],
                       proxy_info['debug_var_addr'], proxy_info['state'])

    self.redis_client.init_connection(self.get_group_info(), self.get_proxy_info())
    self.init_done()
    return None
def main(): """ Starts the groomer. """ logging.basicConfig(format=LOG_FORMAT, level=logging.INFO) parser = argparse.ArgumentParser() parser.add_argument('-v', '--verbose', action='store_true', help='Output debug-level logging') args = parser.parse_args() if args.verbose: logger.setLevel(logging.DEBUG) zk_hosts = appscale_info.get_zk_node_ips() zk_client = KazooClient(hosts=','.join(zk_hosts), connection_retry=ZK_PERSISTENT_RECONNECTS, command_retry=KazooRetry(max_tries=-1)) zk_client.start() db_access = DatastoreProxy() thread_pool = ThreadPoolExecutor(4) TransactionGroomer(zk_client, db_access, thread_pool) logger.info('Starting transaction groomer') IOLoop.current().start()
def expire_session(self, client_id=None):
    """Force ZK to expire a client session

    :param client_id: id of client to expire. If unspecified, the id of
                      self.client will be used.
    """
    client_id = client_id or self.client.client_id

    lost = threading.Event()
    safe = threading.Event()

    def watch_loss(state):
        if state == KazooState.LOST:
            lost.set()
        if lost.is_set() and state == KazooState.CONNECTED:
            safe.set()
            return True

    self.client.add_listener(watch_loss)

    # Sometimes we have to do this a few times
    attempts = 0
    while attempts < 5 and not lost.is_set():
        client = KazooClient(self.hosts, client_id=client_id, timeout=0.8)
        client.start()
        client.stop()
        lost.wait(5)
        attempts += 1

    # Wait for the reconnect now
    safe.wait(15)
    self.client.retry(self.client.get_async, '/')
def __setstate__(self, state):
    hosts = state.pop('client')
    client = KazooClient(hosts)
    client.start()
    self.__dict__ = state
    self.client = client
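The unpickling hook above expects state['client'] to carry a hosts string rather than a live client. A matching __getstate__ sketch under that assumption; it reads the hosts back from the client's hosts attribute, which may need adjusting if the original class tracks the connection string elsewhere:

def __getstate__(self):
    # Replace the unpicklable KazooClient with its hosts string so
    # __setstate__ can rebuild and restart the connection.
    state = self.__dict__.copy()
    state['client'] = self.client.hosts  # assumption: hosts string is recoverable here
    return state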
def from_task(self, task, sandbox):
    data = json.loads(task.data)
    cluster_name, host, port, zk_url = data['cluster'], data['host'], data['port'], data['zk_url']
    _, servers, path = parse(zk_url)
    kazoo = KazooClient(servers)
    kazoo.start()
    self_instance = ServiceInstance(Endpoint(host, port))

    try:
        task_control = self._task_control_provider.from_task(task, sandbox)
        installer = self._installer_provider.from_task(task, sandbox)
        backup_store = self._backup_store_provider.from_task(task, sandbox)
    except (TaskControl.Error, PackageInstaller.Error) as e:
        kazoo.stop()  # Kazoo needs to be cleaned up. See kazoo/issues/217.
        raise TaskError(e.message)

    state_manager = StateManager(sandbox, backup_store)

    return MysosTaskRunner(
        self_instance,
        kazoo,
        get_cluster_path(path, cluster_name),
        installer,
        task_control,
        state_manager)
def get_zoo_client(cluster_name="qconf"):
    """get zoo client by cluster_name
    """
    global ZOO_CLIENTS
    if cluster_name not in ZOO_CLIENTS:
        # get zookeeper hosts info
        zookeeper = ZdZookeeper.one(cluster_name=cluster_name, deleted="0")
        if not zookeeper:
            raise ZookeeperConfError("Zookeeper not configured for cluster: {}!".format(cluster_name))

        # connect to zookeeper
        try:
            client = KazooClient(hosts=zookeeper.hosts,
                                 connection_retry={"max_tries": 3, "backoff": 2})
            client.start(3)
            ZOO_CLIENTS[cluster_name] = client
        except KazooTimeoutError as exc:
            log.error('Failed to connect zookeeper, %s', str(exc))
            return

    # check connection's state, if not connected, reconnect
    zoo_client = ZOO_CLIENTS[cluster_name]
    if not zoo_client.connected:
        zoo_client.restart()
    return zoo_client
class PinotZk(object):

    def __init__(self, config, logger, fabric):
        self.config = config
        self.fabric = fabric
        self.logger = logger
        self.zk = None

    def get_handle(self):
        host = self.config.get_zk_host(self.fabric)

        if not self.zk:
            try:
                self.zk = KazooClient(hosts=host)
                self.zk.start()
            except kazoo.exceptions.KazooException:
                error = 'Failed connecting to zk {0}'.format(host)
                self.logger.exception(error)
                raise PinotException(error)

        return self.zk

    def close(self):
        if self.zk:
            self.zk.stop()
            self.zk.close()
def init_hierarchy(hosts, hierarchy, users, auth):
    zkcli = KazooClient(hosts)
    zkcli.start()

    scheme, name, passw = auth
    zkcli.add_auth(scheme, name + ':' + passw)

    def _init_hierarchy(hierarchy, parent_path):
        if len(hierarchy) == 0:
            return

        for node, attr_children in hierarchy.items():
            val = attr_children.get('__val__', {})
            val = utfjson.dump(val)
            acl = attr_children.get('__acl__')

            path = _init_node(zkcli, parent_path, node, val, acl, users)
            children = {k: v
                        for k, v in attr_children.items()
                        if k not in ('__val__', '__acl__')}
            _init_hierarchy(children, path)

    _init_hierarchy(hierarchy, '/')
    close_zk(zkcli)
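For reference, a sketch of the nested mapping shape that init_hierarchy traverses: __val__ holds a znode's value, __acl__ (optional) its ACL spec, and every other key becomes a child znode. The concrete names and values below are illustrative only:

# Illustrative input only: two top-level znodes, one of which has a child.
hierarchy = {
    'config': {
        '__val__': {'enabled': True},      # stored (JSON-encoded) as the value of /config
        'feature_flags': {                 # child znode /config/feature_flags
            '__val__': {'new_ui': False},
        },
    },
    'locks': {},                           # znode created with the default value {}
}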
def chunk(args=None):
    args = chunk_parser.parse_args(args)

    # Log verbosity
    verbosity = args.verbose - args.quiet
    if args.debug:
        log_level = logging.DEBUG - verbosity*10
    else:
        log_level = logging.WARN - verbosity*10
    logging.basicConfig(level=log_level)
    logging.getLogger('kazoo.client').setLevel(log_level + 20)

    # Zookeeper servers
    if len(args.servers):
        zk_hosts = ','.join(args.servers)
    else:
        zk_hosts = '127.0.0.1:2181'

    # Zookeeper client
    zk = KazooClient(hosts=zk_hosts)
    zk.start()

    # ChunkServer
    cs = HTTPChunkServer(zk=zk, addr=(args.host, args.port),
                         cache_path=args.chunk_cache, hash_data=args.hash_data)
    cs.run()

    # Cleanup
    zk.stop()
class Exhibitor:

    def __init__(self, exhibitor, chroot):
        self.chroot = chroot
        self.exhibitor = ExhibitorEnsembleProvider(exhibitor['hosts'], exhibitor['port'], poll_interval=30)
        self.client = KazooClient(hosts=self.exhibitor.zookeeper_hosts + self.chroot,
                                  command_retry={'deadline': 10, 'max_delay': 1, 'max_tries': -1},
                                  connection_retry={'max_delay': 1, 'max_tries': -1})
        self.client.add_listener(self.session_listener)
        self.client.start()

    def session_listener(self, state):
        pass

    def _poll_exhibitor(self):
        if self.exhibitor.poll():
            self.client.set_hosts(self.exhibitor.zookeeper_hosts + self.chroot)

    def get(self, *params):
        self._poll_exhibitor()
        return self.client.retry(self.client.get, *params)

    def get_children(self, *params):
        self._poll_exhibitor()
        try:
            return self.client.retry(self.client.get_children, *params)
        except NoNodeError:
            return []
def main():
    global datastore_path
    global deployment_config

    logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('-p', '--port', type=int, default=DEFAULT_PORT,
                        required=True, help="The blobstore server's port")
    parser.add_argument('-d', '--datastore-path', required=True,
                        help='The location of the datastore server')
    args = parser.parse_args()

    datastore_path = args.datastore_path
    zk_ips = appscale_info.get_zk_node_ips()
    zk_client = KazooClient(hosts=','.join(zk_ips))
    zk_client.start()
    deployment_config = DeploymentConfig(zk_client)
    setup_env()

    http_server = tornado.httpserver.HTTPServer(
        Application(), max_buffer_size=MAX_REQUEST_BUFF_SIZE, xheaders=True)
    http_server.listen(args.port)

    # Make sure this server is accessible from each of the load balancers.
    secret = appscale_info.get_secret()
    for load_balancer in appscale_info.get_load_balancer_ips():
        acc = AppControllerClient(load_balancer, secret)
        acc.add_routing_for_blob_server()

    logger.info('Starting BlobServer on {}'.format(args.port))
    tornado.ioloop.IOLoop.instance().start()
def main_loop():
    logging.basicConfig()

    zk = KazooClient(hosts=zk_connect_string)
    zk.start()

    # make sure the root folders for the sendgraph and the schedules exist
    zk.ensure_path(metrics_zk_path)
    zk.ensure_path(schedule_zk_path)

    for topology in zk.get_children(metrics_zk_path):
        topology_metrics_zk_path = metrics_zk_path + "/" + topology
        print("registering watcher schedule for " + topology_metrics_zk_path)

        # register a data watch for each
        def watchFunc(data, stat, event):
            # print("watch called")
            if event is not None and event.type == EventType.CHANGED:
                print("new sendgraph data for {0} at {1}".format(topology, byteArrayToInt(data)))
                schedule(zk, topology)
            return True  # returning false will disable the watch

        # install data watch
        # DataWatch(zk, topology_metrics_zk_path, func=watchFunc)

        # if there is some data already, schedule immediately
        if len(zk.get_children(topology_metrics_zk_path)):
            print("existing sendgraph data for {0}".format(topology))
            schedule(zk, topology)
def get_children_data(ensemble, namespace, read_only=True):
    hdfs = cluster.get_hdfs()
    if hdfs is None:
        raise PopupException(_('No [hdfs] configured in hue.ini.'))

    if hdfs.security_enabled:
        sasl_server_principal = PRINCIPAL_NAME.get()
    else:
        sasl_server_principal = None

    zk = KazooClient(hosts=ensemble, read_only=read_only,
                     sasl_server_principal=sasl_server_principal)
    zk.start()

    children_data = []
    children = zk.get_children(namespace)
    for node in children:
        data, stat = zk.get("%s/%s" % (namespace, node))
        children_data.append(data)

    zk.stop()

    return children_data
def mkfs(args=None):
    args = mkfs_parser.parse_args(args)

    # Log verbosity
    verbosity = args.verbose - args.quiet
    log_level = logging.WARN - verbosity*10
    logging.basicConfig(level=log_level)
    logging.getLogger('kazoo.client').setLevel(log_level + 20)

    # ZK Path of filesystem root
    zk_root = posixpath.join(FILESYSTEMS, args.name)

    # Zookeeper
    if len(args.servers):
        zk_hosts = ','.join(args.servers)
    else:
        zk_hosts = '127.0.0.1:2181'
    zk = KazooClient(hosts=zk_hosts)
    zk.start()

    # Run
    ClowderFS.mkfs(zk=zk, fs_root=zk_root, chunk_size=args.chunk_size)

    # Cleanup
    zk.stop()
def run(self):
    zk = KazooClient(hosts='%s:%d' % (self.options.host, self.options.port),
                     read_only=True, timeout=3)
    try:
        zk.start()

        options = vars(self.options)
        options.update({'system.hostname': socket.gethostname()})

        if self.options.regex:
            content, stats = zk.get(self.options.file)
            options['stats'] = stats
            m = re.search(self.options.regex, content, re.MULTILINE | re.DOTALL)
            if m:
                options.update(m.groupdict())
                self.ok(self.options.message.format(**options))
            else:
                self.critical(self.options.message.format(**options))
        elif zk.exists(self.options.file):
            self.ok(self.options.message.format(**options))
        else:
            self.critical(self.options.message.format(**options))
    except Exception as ex:
        self.critical(ex)
    finally:
        zk.stop()
def processTransfer():
    try:
        conn = psycopg2.connect(dbConnectStr)
        cur = conn.cursor()
        zk = KazooClient(hosts=zkHost)
        zk.start()
        transferq = LockingQueue(zk, '/transfer/')

        while True:
            rawCode = transferq.get()
            proposal = rawCode.decode().strip()
            transferq.consume()
            # print(" proposal = {0} ".format(proposal))
            ints = datetime.now()
            inload = os.getloadavg()[0]
            pro1 = Popen(['/usr/bin/python36', './processproptran.py', proposal], stdin=None, stdout=None)
            pro1.wait()
            outts = datetime.now()
            outload = os.getloadavg()[0]

            # insert the runtime info into c*
            cluster = Cluster(cfg.cassCluster)
            session = cluster.connect(cfg.cassKeyspace)
            stmt = SimpleStatement("""insert into runstat(id,executable,ints,inload,outts,outload)
                                      values (%s, %s, %s, %s, %s, %s)""",
                                   consistency_level=ConsistencyLevel.ANY)
            session.execute(stmt, (uuid.uuid4(), executable, ints, inload, outts, outload))
    except psycopg2.Error as err:
        print("SQLError {0}".format(err))
    finally:
        zk.stop()
        zk.close()
        cur.close()
        conn.close()
import logging
import os
import uuid
import time
import subprocess
import threading
import sqlite3 as sqlite3

import docker
from kazoo.client import KazooClient
from kazoo.client import KazooState

logging.basicConfig()

new_master = 0

zk = KazooClient(hosts='zoo:2181', timeout=1.0)
zk.start(timeout=1)

# get cid, pid of container running this code
cmd = "cat /proc/self/cgroup | grep 'docker' | sed 's/^.*\///' | tail -n1"
cid = subprocess.check_output(cmd, shell=True)
cid = cid.decode("utf-8")
cid = cid[0:12]

client2 = docker.APIClient()
pid = client2.inspect_container(cid)['State']['Pid']
print("---PID---", pid)
print("---CID---", cid)

zk.ensure_path("/worker")

if zk.exists("/worker/slave"):
    print("Slave exists")
else:
#!/usr/bin/env python
import pika
import sys
import json
import sqlite3
import datetime
import logging
from random import random

from sqlalchemy import create_engine, and_, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from kazoo.client import KazooClient

logging.basicConfig()

zk = KazooClient(hosts='zoo:2181')
zk.start()
zk.ensure_path("/slave")
zk.create("/slave/" + str(random()), b"", ephemeral=True)

Base = declarative_base()


class User(Base):
    __tablename__ = "User"
    username = Column("username", String(50), primary_key=True)
    password = Column("password", String(50), primary_key=True)


class Rideshare(Base):
    __tablename__ = "Rideshare"
    rideid = Column("rideid", Integer, primary_key=True, autoincrement=True)
    created_by = Column("created_by", String(50))
    timestamp = Column("timestamp", String(50))
    source = Column("source", String(50))
    dest = Column("dest", String(50))
def check(self, instance): consumer_groups = self.read_config(instance, 'consumer_groups', cast=self._validate_consumer_groups) zk_connect_str = self.read_config(instance, 'zk_connect_str') kafka_host_ports = self.read_config(instance, 'kafka_connect_str') # Construct the Zookeeper path pattern zk_prefix = instance.get('zk_prefix', '') zk_path_tmpl = zk_prefix + '/consumers/%s/offsets/%s/%s' # Connect to Zookeeper zk_conn = KazooClient(zk_connect_str, timeout=self.zk_timeout) zk_conn.start() try: # Query Zookeeper for consumer offsets consumer_offsets = {} topics = defaultdict(set) for consumer_group, topic_partitions in consumer_groups.iteritems(): for topic, partitions in topic_partitions.iteritems(): # Remember the topic partitions that we've see so that we can # look up their broker offsets later topics[topic].update(set(partitions)) for partition in partitions: zk_path = zk_path_tmpl % (consumer_group, topic, partition) try: consumer_offset = int(zk_conn.get(zk_path)[0]) key = (consumer_group, topic, partition) consumer_offsets[key] = consumer_offset except NoNodeError: self.log.warn('No zookeeper node at %s' % zk_path) except Exception: self.log.exception('Could not read consumer offset from %s' % zk_path) finally: try: zk_conn.stop() zk_conn.close() except Exception: self.log.exception('Error cleaning up Zookeeper connection') # Connect to Kafka kafka_conn = KafkaClient(kafka_host_ports, timeout=self.kafka_timeout) try: # Query Kafka for the broker offsets broker_offsets = {} for topic, partitions in topics.items(): offset_responses = kafka_conn.send_offset_request([ OffsetRequest(topic, p, -1, 1) for p in partitions]) for resp in offset_responses: broker_offsets[(resp.topic, resp.partition)] = resp.offsets[0] finally: try: kafka_conn.close() except Exception: self.log.exception('Error cleaning up Kafka connection') # Report the broker data for (topic, partition), broker_offset in broker_offsets.items(): broker_tags = ['topic:%s' % topic, 'partition:%s' % partition] broker_offset = broker_offsets.get((topic, partition)) self.gauge('kafka.broker_offset', broker_offset, tags=broker_tags) # Report the consumer for (consumer_group, topic, partition), consumer_offset in consumer_offsets.items(): # Get the broker offset broker_offset = broker_offsets.get((topic, partition)) # Report the consumer offset and lag tags = ['topic:%s' % topic, 'partition:%s' % partition, 'consumer_group:%s' % consumer_group] self.gauge('kafka.consumer_offset', consumer_offset, tags=tags) self.gauge('kafka.consumer_lag', broker_offset - consumer_offset, tags=tags)
class ZChunkserver: def __init__(self, zoo_ip='localhost:2181'): self.chunktable = {} self.chunkloc = None self.master = zerorpc.Client() self.zookeeper = KazooClient(zoo_ip) # register with zookeeper, get IP of master # TODO: need to add handling in case master is down here try: self.master_ip = self._register_with_zookeeper() print 'Chunkserver %d Connecting to master at %s' % (int( self.chunkloc), self.master_ip) self.master.connect(self.master_ip) except NoNodeError: print "No master record in zookeeper" raise # TODO handle shadow master/waiting for master to reconnect later except Exception as e: print "Unexpected error connecting to master:" print e.__doc__, e.message # local directory where chunks are stored self.local_filesystem_root = "/tmp/gfs/chunks/" #+ repr(int(self.chunkloc)) if not os.access(self.local_filesystem_root, os.W_OK): os.makedirs(self.local_filesystem_root) def _register_with_zookeeper(self): def my_listener(state): if state == KazooState.LOST or state == KazooState.SUSPENDED: print "suspended|lost state" # TODO connect to zookeeper again try: self.zookeeper.start() self.zookeeper.add_listener(my_listener) self.zookeeper.ensure_path('chunkserver') master_ip = self.zookeeper.get('master')[0].split('@')[-1] path = self.zookeeper.create('chunkserver/', ephemeral=True, sequence=True) self.chunkloc = path.replace('/chunkserver/', '') data = '{username}@{tcpip}'.format( username=getpass.getuser(), tcpip=zutils.get_tcp(4400 + int(self.chunkloc))) # self.zookeeper.set(path, zutils.get_tcp(4400 + int(self.chunkloc))) self.zookeeper.set(path, data) except Exception as e: print "Exception while registering with zookeeper: %s, %s" % ( type(e).__name__, e.args) return master_ip def print_name(self): """ Prints name to test connectivity """ print 'I am chunkserver #' + str(int(self.chunkloc)) self.master.answer_server(int(self.chunkloc)) def write(self, chunkuuid, chunk, forward=None): local_filename = self.chunk_filename(chunkuuid) try: with open(local_filename, "wb") as f: f.write(chunk) self.chunktable[chunkuuid] = local_filename except: return False #print "forward is ", forward if forward: print "Forwarding chunk to loc", forward self.send_chunk(chunkuuid, str([forward]), chunk) return xxhash.xxh64(chunk).digest() def close(self): self.master.close() @staticmethod def get_stats(): results = [] pattern = r' \d+[\.]?\d*' first = ['ifstat', '-q', '-i', 'enP0s3', '-S', '0.2', '1'] # get network traffic second = ['df', '/'] # get free space p1 = subprocess.Popen(first, stdout=subprocess.PIPE) p2 = subprocess.Popen(second, stdout=subprocess.PIPE) # get transfer speed and parse results transfer_speed = p1.communicate()[0] transfer_speed = re.findall(pattern, transfer_speed) results.append(sum([float(num) for num in transfer_speed])) # get storage info and parse results storage = p2.communicate()[0] storage = re.findall(r'\d+%', storage) # find entry with % results.append(int(storage[0][:-1])) # append entry without % return results ############################################################################## def rwrite(self, chunkuuid, chunk): local_filename = self.chunk_filename(chunkuuid) try: with open(local_filename, "wb") as f: f.write(chunk) self.chunktable[chunkuuid] = local_filename return True except: return False def read(self, chunkuuid): data = None local_filename = self.chunk_filename(chunkuuid) with open(local_filename, "rb") as f: data = f.read() return data def _establish_connection(self, chunkloc): chunkservers = self.master.get('chunkservers') zclient = 
zerorpc.Client() print 'Server connecting to chunkserver at %s' % chunkloc zclient.connect(chunkservers[chunkloc]) #zclient.print_name() return zclient def delete(self, chunkuuids): for chunkid in chunkuuids: filename = self.chunk_filename(chunkid) try: if os.path.exists(filename): print "Removing " + filename os.remove(filename) return True except: None def disp(self, a): print str(a) + str(self.chunkloc) def chunk_filename(self, chunkuuid): local_filename = self.local_filesystem_root + "/" + str( chunkuuid) + '.gfs' return local_filename def copy_chunk(self, chunkid, chunklocs): chunklocs = ast.literal_eval(chunklocs) flag = False for chunkloc in chunklocs: try: chunkserver = self._establish_connection(chunkloc) # TODO md5 check data = chunkserver.read(chunkid) flag = self.rwrite(chunkid, data) if flag: break except Exception as e: flag = False print "some error happend in copy_chunk", type( e).__name__, e.args return flag def send_chunk(self, chunkid, chunklocs, data): chunklocs = ast.literal_eval(chunklocs) flag = False for chunkloc in chunklocs: try: chunkserver = self._establish_connection(chunkloc) flag = chunkserver.rwrite(chunkid, data) if flag: break except Exception as e: flag = False self.master.print_exception('sending chunk', None, type(e).__name__) return flag def rename(self, chunkids, filename, newfilename): for chunkid in chunkids: local_filename = self.chunk_filename(chunkid) new_local_filename = local_filename.split('/') new_local_filename[-1] = new_local_filename[-1].replace( filename, newfilename) new_local_filename = '/'.join(new_local_filename) print "Changing %s to %s" % (local_filename, new_local_filename) try: os.rename(local_filename, new_local_filename) except: os.remove(new_local_filename) os.rename(local_filename, new_local_filename) return True def populate(self): #print "in populate, chunkloc=", self.chunkloc local_dir = self.chunk_filename("").replace(".gfs", "") #print "local dir is ", local_dir file_list = os.listdir(local_dir) if len(file_list) != 0: files = {} for items in file_list: # TODO # if master.exists # read all chunks (in parallel?) # if any xxhash is not the same, os.delete() # else add as regular items = items.replace(".gfs", "") filename = items.split("$%#")[0] self.chunktable[items] = self.chunk_filename(items) try: files[filename].append(items) except: files[filename] = [] files[filename].append(items) #print "files=%s, chunkloc=%s" % (files, self.chunkloc) # self.master.populate(files, str(self.chunkloc)) return files, self.chunkloc else: print "nothing to populate" return None, None
def zk_client():
    zk_client = KazooClient(hosts=os.environ.get('ZK_HOST'))
    zk_client.start()
    # rm_files_from(UPLOAD_DIR)
    yield zk_client
    zk_client.stop()
class ZookeeperDatabase(Database): # used as prefix for key, to namespace all queries hosts: List[str] namespace: str timeout: Numeric( 1, 60) = 5. # request timeout in seconds (tries another host) [s] ssl: Optional[SSL] = None def __post_init__(self): from kazoo.client import KazooClient if self.ssl: if isinstance(self.ssl.server_verify, str): self._client = KazooClient( hosts=self.hosts, timeout=self.timeout, use_ssl=True, verify_certs=True, ca=self.ssl.server_verify, certfile=self.ssl.client_cert_path, keyfile=self.ssl.client_key_path, ) elif isinstance(self.ssl.server_verify, bool): self._client = KazooClient( hosts=self.hosts, timeout=self.timeout, use_ssl=True, verify_certs=self.ssl.server_verify, certfile=self.ssl.client_cert_path, keyfile=self.ssl.client_key_path, ) else: raise ValidationError( 'SSL server verify must be type of Path or boolean!') else: self._client = KazooClient(hosts=self.hosts, timeout=self.timeout) self._client.start() def set(self, key: bytes, value: bytes): _validate_key(key) _validate_value(value) formatted_key = key.decode('ascii') full_path = os.path.join(self.namespace, formatted_key) self._client.ensure_path(full_path) self._client.set(full_path, value) def get(self, key: bytes) -> bytes: from kazoo.exceptions import NoNodeError _validate_key(key) formatted_key = key.decode('ascii') full_path = os.path.join(self.namespace, formatted_key) try: data = self._client.get(full_path) return bytes(data[0]) except NoNodeError: return None
#! /usr/bin/python
# -*- coding:utf-8 -*-
# @zhuchen : 2020/4/8 20:44

from kazoo.client import KazooClient

client = KazooClient()
client.start()


class ZkHosts:
    go_host = []
    python_host = []


zk_host = ZkHosts()


@client.ChildrenWatch('/zhuchen/golang')
def golang_watch(*args):
    print('golang update')
    hosts = args[0] if args else []
    new_hosts = []
    for host_name in hosts:
        d, _ = client.get(f'/zhuchen/golang/{host_name}')
        new_hosts.append(d.decode())
    zk_host.go_host = new_hosts


@client.DataWatch('/zhuchen/python')
class ZooKeeperJobStore(BaseJobStore): """ Stores jobs in a ZooKeeper tree. Any leftover keyword arguments are directly passed to kazoo's `KazooClient <http://kazoo.readthedocs.io/en/latest/api/client.html>`_. Plugin alias: ``zookeeper`` :param str path: path to store jobs in :param client: a :class:`~kazoo.client.KazooClient` instance to use instead of providing connection arguments :param int pickle_protocol: pickle protocol level to use (for serialization), defaults to the highest available """ def __init__(self, path='/apscheduler', client=None, close_connection_on_exit=False, pickle_protocol=pickle.HIGHEST_PROTOCOL, **connect_args): super().__init__() self.pickle_protocol = pickle_protocol self.close_connection_on_exit = close_connection_on_exit if not path: raise ValueError('The "path" parameter must not be empty') self.path = path if client: self.client = maybe_ref(client) else: self.client = KazooClient(**connect_args) self._ensured_path = False def _ensure_paths(self): if not self._ensured_path: self.client.ensure_path(self.path) self._ensured_path = True def start(self, scheduler, alias): super().start(scheduler, alias) if not self.client.connected: self.client.start() def lookup_job(self, job_id): self._ensure_paths() node_path = os.path.join(self.path, job_id) try: content, _ = self.client.get(node_path) doc = pickle.loads(content) job = self._reconstitute_job(doc['job_state']) return job except BaseException: return None def get_due_jobs(self, now): timestamp = datetime_to_utc_timestamp(now) jobs = [ job_def['job'] for job_def in self._get_jobs() if job_def['next_run_time'] is not None and job_def['next_run_time'] <= timestamp ] return jobs def get_next_run_time(self): next_runs = [ job_def['next_run_time'] for job_def in self._get_jobs() if job_def['next_run_time'] is not None ] return utc_timestamp_to_datetime( min(next_runs)) if len(next_runs) > 0 else None def get_all_jobs(self): jobs = [job_def['job'] for job_def in self._get_jobs()] self._fix_paused_jobs_sorting(jobs) return jobs def add_job(self, job): self._ensure_paths() node_path = os.path.join(self.path, str(job.id)) value = { 'next_run_time': datetime_to_utc_timestamp(job.next_run_time), 'job_state': job.__getstate__() } data = pickle.dumps(value, self.pickle_protocol) try: self.client.create(node_path, value=data) except NodeExistsError: raise ConflictingIdError(job.id) def update_job(self, job): self._ensure_paths() node_path = os.path.join(self.path, str(job.id)) changes = { 'next_run_time': datetime_to_utc_timestamp(job.next_run_time), 'job_state': job.__getstate__() } data = pickle.dumps(changes, self.pickle_protocol) try: self.client.set(node_path, value=data) except NoNodeError: raise JobLookupError(job.id) def remove_job(self, job_id): self._ensure_paths() node_path = os.path.join(self.path, str(job_id)) try: self.client.delete(node_path) except NoNodeError: raise JobLookupError(job_id) def remove_all_jobs(self): try: self.client.delete(self.path, recursive=True) except NoNodeError: pass self._ensured_path = False def shutdown(self): if self.close_connection_on_exit: self.client.stop() self.client.close() def _reconstitute_job(self, job_state): job_state = job_state job = Job.__new__(Job) job.__setstate__(job_state) job._scheduler = self._scheduler job._jobstore_alias = self._alias return job def _get_jobs(self): self._ensure_paths() jobs = [] failed_job_ids = [] all_ids = self.client.get_children(self.path) for node_name in all_ids: try: node_path = os.path.join(self.path, node_name) content, _ = 
self.client.get(node_path) doc = pickle.loads(content) job_def = { 'job_id': node_name, 'next_run_time': doc['next_run_time'] if doc['next_run_time'] else None, 'job_state': doc['job_state'], 'job': self._reconstitute_job(doc['job_state']), 'creation_time': _.ctime } jobs.append(job_def) except BaseException: self._logger.exception( 'Unable to restore job "%s" -- removing it' % node_name) failed_job_ids.append(node_name) # Remove all the jobs we failed to restore if failed_job_ids: for failed_id in failed_job_ids: self.remove_job(failed_id) paused_sort_key = datetime(9999, 12, 31, tzinfo=utc) return sorted(jobs, key=lambda job_def: (job_def['job'].next_run_time or paused_sort_key, job_def['creation_time'])) def __repr__(self): self._logger.exception('<%s (client=%s)>' % (self.__class__.__name__, self.client)) return '<%s (client=%s)>' % (self.__class__.__name__, self.client)
class ShellTestCase(unittest.TestCase): """ base class for all tests """ @classmethod def setUpClass(cls): get_global_cluster().start() def setUp(self): """ make sure that the prefix dir is empty """ self.tests_path = os.getenv("ZKSHELL_PREFIX_DIR", "/tests") self.zk_hosts = ",".join(server.address for server in get_global_cluster()) self.username = os.getenv("ZKSHELL_USER", "user") self.password = os.getenv("ZKSHELL_PASSWD", "user") self.digested_password = os.getenv("ZKSHELL_DIGESTED_PASSWD", "F46PeTVYeItL6aAyygIVQ9OaaeY=") self.super_password = os.getenv("ZKSHELL_SUPER_PASSWD", "secret") self.scheme = os.getenv("ZKSHELL_AUTH_SCHEME", "digest") self.client = KazooClient(self.zk_hosts, 5) self.client.start() self.client.add_auth(self.scheme, self.auth_id) if self.client.exists(self.tests_path): self.client.delete(self.tests_path, recursive=True) self.client.create(self.tests_path, str.encode("")) self.output = XStringIO() self.shell = Shell([self.zk_hosts], 5, self.output, setup_readline=False, async=False) # Create an empty test dir (needed for some tests) self.temp_dir = tempfile.mkdtemp() @property def auth_id(self): return "%s:%s" % (self.username, self.password) @property def auth_digest(self): return "%s:%s" % (self.username, self.digested_password) def tearDown(self): if self.output is not None: self.output.close() self.output = None if self.shell is not None: self.shell._disconnect() self.shell = None if os.path.isdir(self.temp_dir): shutil.rmtree(self.temp_dir) if self.client is not None: if self.client.exists(self.tests_path): self.client.delete(self.tests_path, recursive=True) self.client.stop() self.client.close() self.client = None ### # Helpers. ## def create_compressed(self, path, value): """ ZK Shell doesn't support creating directly from a bytes array so we use a Kazoo client to create a znode with zlib compressed content. """ compressed = zlib.compress(bytes(value, "utf-8") if PYTHON3 else value) self.client.create(path, compressed, makepath=True)
def check(self, instance): """ Check offset in kafka for consumer_groups,topics and partitions. Alt 1; You can ether specify consumer_groups, topics and partitions in config file like consumer_groups: my_consumer: my_topic: [0, 1, 4, 12] Alt 2; Ask zookeeper for the current configuration and use that, it will do this if no consumer_groups is specifyed in configuration. """ zk_connect_str = self.read_config(instance, 'zk_connect_str') kafka_host_ports = self.read_config(instance, 'kafka_connect_str') # Construct the Zookeeper path pattern zk_prefix = instance.get('zk_prefix', '') # Connect to Zookeeper zk_conn = KazooClient(zk_connect_str) zk_conn.start() try: if instance.has_key('consumer_groups'): #Alt1, Only check the given consumer groups, topics and partions. consumer_groups = self.read_config( instance, 'consumer_groups', cast=self._validate_consumer_groups) (consumer_offsets, topics) = \ self._get_offsets_based_on_config(zk_conn, zk_prefix, consumer_groups) else: #Alt2, Non given lets ask zookeeper for a full set. (consumer_offsets, topics) = \ self._get_offsets_from_zk(zk_conn, zk_prefix) finally: try: zk_conn.stop() zk_conn.close() except Exception: self.log.exception('Error cleaning up Zookeeper connection') # Connect to Kafka kafka_conn = KafkaClient(kafka_host_ports) try: # Query Kafka for the broker offsets broker_offsets = {} for topic, partitions in topics.items(): offset_responses = kafka_conn.send_offset_request( [OffsetRequest(topic, p, -1, 1) for p in partitions]) for resp in offset_responses: broker_offsets[(resp.topic, resp.partition)] = resp.offsets[0] finally: try: kafka_conn.close() except Exception: self.log.exception('Error cleaning up Kafka connection') # Report the broker data for (topic, partition), broker_offset in broker_offsets.items(): broker_tags = ['topic:%s' % topic, 'partition:%s' % partition] broker_offset = broker_offsets.get((topic, partition)) self.gauge('kafka.broker_offset', broker_offset, tags=broker_tags) # Report the consumer for (consumer_group, topic, partition), consumer_offset in consumer_offsets.items(): # Get the broker offset broker_offset = broker_offsets.get((topic, partition)) # Report the consumer offset and lag tags = [ 'topic:%s' % topic, 'partition:%s' % partition, 'consumer_group:%s' % consumer_group ] self.gauge('kafka.consumer_offset', consumer_offset, tags=tags) self.gauge('kafka.consumer_lag', broker_offset - consumer_offset, tags=tags)
def get_kazoo_client(self, zoo_instance_name):
    zk = KazooClient(hosts=self.get_instance_ip(zoo_instance_name))
    zk.start()
    return zk
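The helper above hands back an already-started client and leaves shutdown to the caller. A minimal usage sketch, assuming a cluster object exposing this method and an instance named 'zoo1' (both placeholders):

zk = cluster.get_kazoo_client('zoo1')
try:
    zk.create('/test-node', b'payload', makepath=True)
    value, stat = zk.get('/test-node')
finally:
    # The helper never stops the client, so the caller must.
    zk.stop()
    zk.close()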
class PartitionClient(object): """ Client Class for the Partition Library Example usage: --------------------- import libpartition from libpartition.libpartition import PartitionClient def own_change_cb(l): print "ownership change:" + str(l) c = PartitionClient("test", "s1", ["s1", "s2", "s3"], 32, own_change_cb, "zookeeper_s1") ##do some real work now" if (c.own_partition(1)): ...... do something with partition #1 ..... ......... ... c.update_cluster_list(["s1", "s2"]) ... ---------------------- You should not call any partition library routine from within the callback function Args: app_name(str): Name of the app for which partition cluster is used self_name(str): Name of the local cluster node (can be ip address) cluster_list(list): List of all the nodes in the cluster including local node max_partition(int): Partition space always go from 0..max_partition-1 partition_update_cb: Callback function invoked when partition ownership list is updated.x zk_server(str): <zookeeper server>:<zookeeper server port> """ def __init__( self, app_name, self_name, cluster_list, max_partition, partition_update_cb, zk_server, logger = None): # Initialize local variables self._zk_server = zk_server self._cluster_list = set(cluster_list) self._max_partition = max_partition self._update_cb = partition_update_cb self._curr_part_ownership_list = [] self._target_part_ownership_list = [] self._con_hash = ConsistentHash(cluster_list) self._name = self_name # some sanity check if not(self._name in cluster_list): raise ValueError('cluster list is missing local server name') # initialize logging and other stuff if logger is None: logging.basicConfig() self._logger = logging else: self._logger = logger self._conn_state = None self._sandesh_connection_info_update(status='INIT', message='') # connect to zookeeper self._zk = KazooClient(zk_server) while True: try: self._zk.start() break except gevent.event.Timeout as e: # Update connection info self._sandesh_connection_info_update(status='DOWN', message=str(e)) gevent.sleep(1) # Zookeeper is also throwing exception due to delay in master election except Exception as e: # Update connection info self._sandesh_connection_info_update(status='DOWN', message=str(e)) gevent.sleep(1) # Update connection info self._sandesh_connection_info_update(status='UP', message='') # Done connecting to ZooKeeper # create a lock array to contain locks for each partition self._part_locks = [] for part in range(0, self._max_partition): lockpath = "/lockpath/"+ app_name + "/" + str(part) l = self._zk.Lock(lockpath, self._name) self._part_locks.append(l) # initialize partition # to lock acquire greenlet dictionary self._part_lock_task_dict = {} self._logger.error("initial servers:" + str(self._cluster_list)) # update target partition ownership list for part in range(0, self._max_partition): if (self._con_hash.get_node(str(part)) == self._name): self._target_part_ownership_list.append(part) # update current ownership list self._acquire_partition_ownership() #end __init__ def _sandesh_connection_info_update(self, status, message): from pysandesh.connection_info import ConnectionState from pysandesh.gen_py.process_info.ttypes import ConnectionStatus, \ ConnectionType from pysandesh.gen_py.sandesh.ttypes import SandeshLevel new_conn_state = getattr(ConnectionStatus, status) ConnectionState.update(conn_type = ConnectionType.ZOOKEEPER, name = 'Zookeeper', status = new_conn_state, message = message, server_addrs = self._zk_server.split(',')) if (self._conn_state and self._conn_state != 
ConnectionStatus.DOWN and new_conn_state == ConnectionStatus.DOWN): msg = 'Connection to Zookeeper down: %s' %(message) self._logger.error(msg) if (self._conn_state and self._conn_state != new_conn_state and new_conn_state == ConnectionStatus.UP): msg = 'Connection to Zookeeper ESTABLISHED' self._logger.error(msg) self._conn_state = new_conn_state # end _sandesh_connection_info_update # following routine is the greenlet task function to acquire the lock # for a partition def _acquire_lock(self, part): # lock for the partition l = self._part_locks[part] # go in an infinite loop waiting to acquire the lock try: while True: ret = l.acquire(blocking=False) if ret == True: self._logger.error("Acquired lock for:" + str(part)) self._curr_part_ownership_list.append(part) self._update_cb(self._curr_part_ownership_list) return True else: gevent.sleep(1) except CancelledError: self._logger.error("Lock acquire cancelled for:" + str(part)) return False except Exception as ex: # TODO: If we have a non-KazooException, the lock object # may get stuck in the "cancelled" state self._logger.error("Lock acquire unexpected error!: " + str(ex)) assert() return False #end _acquire_lock # get rid of finished spawned tasks from datastructures def _cleanup_greenlets(self): for part in self._part_lock_task_dict.keys(): if (self._part_lock_task_dict[part].ready()): del self._part_lock_task_dict[part] #end _cleanup_greenlets # following routine launches tasks to acquire partition locks def _acquire_partition_ownership(self): # cleanup any finished greenlets self._cleanup_greenlets() # this variable will help us decide if we need to call callback updated_curr_ownership = False # list of partitions for which locks have to be released release_lock_list = [] self._logger.error("known servers: %s" % self._con_hash.get_all_nodes()) for part in range(0, self._max_partition): if (part in self._target_part_ownership_list): if (part in self._curr_part_ownership_list): # do nothing, I already have ownership of this partition self._logger.error("No need to acquire ownership of:" + str(part)) else: # I need to acquire lock for this partition before I own if (part in self._part_lock_task_dict.keys()): try: self._part_lock_task_dict[part].get(block=False) except: # do nothing there is already a greenlet running to # acquire the lock self._logger.error("Already a greenlet running to" " acquire:" + str(part)) continue # Greenlet died without getting ownership. 
Cleanup self._logger.error("Cleanup stale greenlet running to" " acquire:" + str(part)) del self._part_lock_task_dict[part] self._logger.error("Starting greenlet running to" " acquire:" + str(part)) # launch the greenlet to acquire the loc, k g = Greenlet.spawn(self._acquire_lock, part) self._part_lock_task_dict[part] = g else: # give up ownership of the partition # cancel any lock acquisition which is ongoing if (part in self._part_lock_task_dict.keys()): try: self._part_lock_task_dict[part].get(block=False) except: self._logger.error("canceling lock acquisition going on \ for:" + str(part)) # Cancelling the lock should result in killing the gevent self._part_locks[part].cancel() self._part_lock_task_dict[part].get(block=True) del self._part_lock_task_dict[part] if (part in self._curr_part_ownership_list): release_lock_list.append(part) self._curr_part_ownership_list.remove(part) updated_curr_ownership = True self._logger.error("giving up ownership of:" + str(part)) if (updated_curr_ownership is True): # current partition membership was updated call the callback self._update_cb(self._curr_part_ownership_list) if (len(release_lock_list) != 0): # release locks which were acquired for part in release_lock_list: self._logger.error("release the lock which was acquired:" + \ str(part)) try: self._part_locks[part].release() self._logger.error("fully gave up ownership of:" + str(part)) except: pass #end _acquire_partition_ownership def update_cluster_list(self, cluster_list): """ Updates the cluster node list Args: cluster_list(list): New list of names of the nodes in the cluster Returns: None """ # some sanity check if not(self._name in cluster_list): raise ValueError('cluster list is missing local server name') new_cluster_list = set(cluster_list) new_servers = list(new_cluster_list.difference( self._cluster_list)) deleted_servers = list(set(self._cluster_list).difference( new_cluster_list)) self._cluster_list = set(cluster_list) self._logger.error("deleted servers:" + str(deleted_servers)) self._logger.error("new servers:" + str(new_servers)) # update the hash structure if new_servers: self._con_hash.add_nodes(new_servers) if deleted_servers: self._con_hash.del_nodes(deleted_servers) # update target partition ownership list self._target_part_ownership_list = [] for part in range(0, self._max_partition): if (self._con_hash.get_node(str(part)) == self._name): if not (part in self._target_part_ownership_list): self._target_part_ownership_list.append(part) # update current ownership list self._acquire_partition_ownership() #end update_cluster_list def own_partition(self, part_no): """ Returns ownership information of a partition Args: part_no(int) : Partition no Returns: True if partition is owned by the local node False if partition is not owned by the local node """ return part_no in self._curr_part_ownership_list #end own_partition def close(self): """ Closes any connections and frees up any data structures Args: Returns: None """ # clean up greenlets for part in self._part_lock_task_dict.keys(): try: self._part_lock_task_dict[part].kill() except: pass # close zookeeper try: self._zk.stop() except: pass try: self._zk.close() except: pass
def spoorer(self): #连接kafka,获取topics try: kafka_client = SimpleClient(self.kafka_hosts, timeout=self.timeout) # print kafka_client.topics except Exception as e: print "Error, cannot connect kafka broker." sys.exit(1) else: kafka_topics = kafka_client.topics finally: kafka_client.close() #连接zk,获取当前消费进度current offset try: zookeeper_client = KazooClient(hosts=self.zookeeper_hosts, read_only=True, timeout=self.timeout) zookeeper_client.start() except Exception as e: print "Error, cannot connect zookeeper server." sys.exit(1) try: groups = map(str,zookeeper_client.get_children(self.zookeeper_url + 'consumers')) except NoNodeError as e: print "Error, invalid zookeeper url." zookeeper_client.stop() sys.exit(2) else: for group in groups: print group if 'offsets' not in zookeeper_client.get_children(self.zookeeper_url + 'consumers/%s' % group):continue topic_path = 'consumers/%s/offsets' % (group) print 22 topics = map(str,zookeeper_client.get_children(self.zookeeper_url + topic_path)) if len(topics) == 0: continue for topic in topics: if topic not in self.white_topic_group.keys(): continue elif group not in self.white_topic_group[topic].replace(' ','').split(','): continue partition_path = 'consumers/%s/offsets/%s' % (group,topic) partitions = map(int,zookeeper_client.get_children(self.zookeeper_url + partition_path)) for partition in partitions: base_path = 'consumers/%s/%s/%s/%s' % (group, '%s', topic, partition) owner_path, offset_path = base_path % 'owners', base_path % 'offsets' offset = zookeeper_client.get(self.zookeeper_url + offset_path)[0] try: owner = zookeeper_client.get(self.zookeeper_url + owner_path)[0] except NoNodeError as e: owner = 'null' #消费进度放在字典metric中 metric = {'datetime':time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), 'topic':topic, 'group':group, 'partition':int(partition), 'logsize':None, 'offset':int(offset), 'lag':None, 'owner':owner} self.result.append(metric) print "ok" finally: zookeeper_client.stop() #获取每个分片的logsize(此处和原文不一样,做了修改) try: client = SimpleClient(self.kafka_hosts) except Exception as e: print "Error, cannot connect kafka broker." sys.exit(1) else: for kafka_topic in kafka_topics: self.kafka_logsize[kafka_topic] = {} partitions = client.topic_partitions[kafka_topic] offset_requests = [OffsetRequestPayload(kafka_topic, p, -1, 1) for p in partitions.keys()] offsets_responses = client.send_offset_request(offset_requests) for r in offsets_responses: self.kafka_logsize[kafka_topic][r.partition] = r.offsets[0] #logsize减去current offset等于lag f1 = open(self.log_file,'a+') f2 = open(self.log_day_file,'a+') str1 = "hello" print 0 # print self.result for metric in self.result: logsize = self.kafka_logsize[metric['topic']][metric['partition']] metric['logsize'] = int(logsize) metric['lag'] = int(logsize) - int(metric['offset']) f1.write(json.dumps(metric,sort_keys=True) + '\n') f1.write(str1) f1.flush() f2.write(json.dumps(metric,sort_keys=True) + '\n') f2.flush() # finally: client.close() print 3 return ''
class Publisher: # instantiate variables and connect to broker def __init__(self, ip_add, name=""): if name == "": self.name = ''.join(random.choices(string.ascii_uppercase + string.digits, k = 8)) else: self.name = name self.history = "" self.kill = True self.topic = "Default" #self.full_add = "tcp://" + str(ip_add) + ":1234" self.context = zmq.Context() self.full_add = "" self.sock_pub = self.context.socket(zmq.PUB) #PRESIDENT ZNODE ADDRESS self.home = "/president/pres" self.zk_driver = KazooClient(hosts='127.0.0.1:2181') self.zk_driver.start() data, stat = self.zk_driver.get(self.home) ports = data.decode('ASCII').split(":") self.full_add = "tcp://" + str(ip_add) + ":" + ports[0] self.sock_pub.connect(self.full_add) # register a topic for this publisher def register_pub(self, topic): self.topic = topic msg = "REGISTER||" + str(self.topic) + "||" + str(self.name) ### CREATE ZNODE node_path = '/' + str(self.topic) + '/' + str(self.name) self.zk_driver.ensure_path('/' + str(self.topic) + '/') if not self.zk_driver.exists(node_path): self.zk_driver.create(node_path, b'0') time.sleep(1) print("Pub ID = ", self.name) self.sock_pub.send_string(msg) return True # publish the given information for pre-registered topic def publish(self, info): self.history = self.history + str(info) + "..." # format for published string is "topic||info" msg = str(self.topic) + "||" + self.history + "||" + str(self.name) #print("Time published: %.20f" % time.time()) # uncomment for measurements purposes self.sock_pub.send_string(msg) @self.zk_driver.DataWatch(self.home) def watch_node(data, stat, event): if event is not None and event.type == "CREATED" and self.kill: # DISCONNECT self.sock_pub.close() self.context.term() self.context = zmq.Context() self.sock_pub = self.context.socket(zmq.PUB) # RECONNECT WITH NEW PORT data, stat = self.zk_driver.get(self.home) ports = data.decode('ASCII').split(":") self.full_add = "tcp://" + str(ip_add) + ":" + ports[0] self.sock_pub.connect(self.full_add) self.kill = False print("Updated Broker! Input information about your topic and press enter to publish!") return True
class Publish(object): _to_zip_node = dict() _to_syc_node = dict() _to_pub_node = dict() _server_list = dict() _root_node = '' _zookeeper = None def __init__(self, host = '127.0.0.1', port = 2181, root_node = '/jzqps'): self._root_node = root_node if root_node[0] == '/' else '/jzgps' self._zookeeper = KazooClient('%s:%s' % (host, port,)) self._zookeeper.start() default_node = [ self._root_node, self._root_node + '/server_list', self._root_node + '/to_zip_notice', self._root_node + '/to_zip_result', self._root_node + '/to_syc_notice', self._root_node + '/to_syc_result', self._root_node + '/to_pub_notice', self._root_node + '/to_pub_result', self._root_node + '/to_rol_notice', self._root_node + '/to_rol_result', ] default_node_value = json.dumps({'update_time' : Tools.g_time()}) try: for node in default_node: if self._zookeeper.exists(node) is None: self._zookeeper.create(node, default_node_value, makepath = True) except kazoo.exceptions.NodeExistsError: pass def server(self, server_node, now_timestamp): server_detail = self._zookeeper.get('%s/server_list/%s' % (self._root_node, server_node, )) if 0 != len(server_detail[0]): tmp_server_detail = json.loads(server_detail[0]) if tmp_server_detail['update_time'] + 10 > now_timestamp: self._server_list[server_node] = tmp_server_detail elif self._server_list.get(server_node, None) is not None: del self._server_list[server_node] return self._server_list.get(server_node, None) def get_pub_node_id(self, pub_id): return 'v%s' % pub_id def get_server_list(self): server_list = [] server_node = self._zookeeper.get_children('/test/server_list/') if len(server_node): now_timestamp = time.time() for s in sorted(server_node): if self.server(s, now_timestamp) is not None: server_list.append(self._server_list[s]) return server_list def to_zip(self, pub_id, zip_callback = None, **ext_data): pub_node_id = self.get_pub_node_id(pub_id) ext_data['pub_id'] = pub_id ext_data['pub_node_id'] = pub_node_id ext_data['update_time'] = Tools.g_time() try: if self._zookeeper.exists(self._root_node + '/to_zip_notice/' + pub_node_id) is None: self._zookeeper.create(self._root_node + '/to_zip_notice/' + pub_node_id, json.dumps(ext_data), makepath = True) else: self._zookeeper.set(self._root_node + '/to_zip_notice/' + pub_node_id, json.dumps(ext_data)) if self._zookeeper.exists(self._root_node + '/to_zip_result/' + pub_node_id) is None: self._zookeeper.create(self._root_node + '/to_zip_result/' + pub_node_id, '', makepath = True) else: self._zookeeper.set(self._root_node + '/to_zip_result/' + pub_node_id, '') except kazoo.exceptions.NodeExistsError: pass if self._to_zip_node.get(pub_node_id, None) is None: self._to_zip_node[pub_node_id] = [zip_callback] self.zip_notice(pub_id, pub_node_id) else: self._to_zip_node[pub_node_id].append(zip_callback) return self def zip_notice(self, pub_id, pub_node_id): @self._zookeeper.DataWatch('%s/to_zip_result/%s' % (self._root_node, pub_node_id, )) def to_zip_notice(data, stat, event): if 0 == len(data) or \ event is None \ or event.type == 'CREATED' \ or event.type == 'DELETED': return LOG.info('%s/to_zip_result/%s changed %s' % (self._root_node, pub_node_id, data, )) for zip_callback in self._to_zip_node[pub_node_id]: zip_callback(data) self._to_zip_node[pub_node_id] = [] return self def to_syc(self, pub_id, target_servers, syc_process_callback=None, syc_success_callback = None, **ext_data): pub_node_id = self.get_pub_node_id(pub_id) ext_data['pub_id'] = pub_id ext_data['pub_node_id'] = pub_node_id ext_data['update_time'] = Tools.g_time() 
ext_data['servers'] = target_servers try: if self._zookeeper.exists(self._root_node + '/to_syc_notice/' + pub_node_id) is None: self._zookeeper.create(self._root_node + '/to_syc_notice/' + pub_node_id, json.dumps(ext_data), makepath = True) else: self._zookeeper.set(self._root_node + '/to_syc_notice/' + pub_node_id, json.dumps(ext_data)) if self._zookeeper.exists(self._root_node + '/to_syc_result/' + pub_node_id) is None: self._zookeeper.create(self._root_node + '/to_syc_result/' + pub_node_id, '', makepath = True) else: self._zookeeper.set(self._root_node + '/to_syc_result/' + pub_node_id, '') for target_server_id in target_servers: target_node = self._root_node + '/to_syc_result/' + pub_node_id + '/s' + str(target_server_id) if self._zookeeper.exists(target_node) is not None: self._zookeeper.delete(target_node) except kazoo.exceptions.NodeExistsError: pass if self._to_syc_node.get(pub_node_id, None) is None: self._to_syc_node[pub_node_id] = { 'callback' : [syc_process_callback, syc_success_callback], 'servers' : target_servers, 'notices' : [], 'results' : {}, 'update_time' : Tools.g_time() } self.syc_children_notice(pub_id, pub_node_id) else : self._to_syc_node[pub_node_id]['callback'] = [syc_process_callback, syc_success_callback] self._to_syc_node[pub_node_id]['servers'] = target_servers self._to_syc_node[pub_node_id]['results'] = {} self._to_syc_node[pub_node_id]['time'] = Tools.g_time() return self def syc_children_notice(self, pub_id, pub_node_id): @self._zookeeper.ChildrenWatch('%s/to_syc_result/%s' % (self._root_node, pub_node_id, )) def to_syc_process(server_list): for server_node in server_list: if server_node not in self._to_syc_node[pub_node_id]['notices']: self._to_syc_node[pub_node_id]['notices'].append(server_node) self.syc_process_notice(pub_id, pub_node_id, server_node) return self def syc_process_notice(self, pub_id, pub_node_id, server_node): syc_server_node = '%s/to_syc_result/%s/%s' % (self._root_node, pub_node_id, server_node, ) @self._zookeeper.DataWatch(syc_server_node) def to_syc_process(data, stat, event): if event is not None and event.type == 'DELETED': return if 0 == len(data): return LOG.info('syc children %s %s' % (syc_server_node, data, )) syc_detail = json.loads(data) if isinstance(syc_detail, dict) == False or \ syc_detail.get('update_time', None) is None or \ syc_detail.get('status', None) is None: return if syc_detail['status'] == 'ok': self._to_syc_node[pub_node_id]['results'][server_node] = True else: self._to_syc_node[pub_node_id]['results'][server_node] = False self._to_syc_node[pub_node_id]['callback'][0](server_node, data) all_syc_finished = True if len(self._to_syc_node[pub_node_id]['servers']) > 0 else False for server_id in self._to_syc_node[pub_node_id]['servers']: target_server_node = 's%s' % server_id if self._to_syc_node[pub_node_id]['results'].get(target_server_node, False) is False: all_syc_finished = False break if all_syc_finished: self._to_syc_node[pub_node_id]['callback'][1]() self._to_syc_node[pub_node_id]['callback'] = [] self._to_syc_node[pub_node_id]['results'] = {} self._zookeeper.set('%s/to_syc_notice/%s' % (self._root_node, pub_node_id, ), json.dumps({ 'pub_id' : pub_id, 'pub_node_id' : pub_node_id, 'update_time' : self._to_syc_node[pub_node_id]['update_time'], 'servers' : self._to_syc_node[pub_node_id]['servers'], 'finish_time' : Tools.g_time(), 'status' : 'ok' })) self._zookeeper.set('%s/to_syc_result/%s' % (self._root_node, pub_node_id, ), json.dumps({ 'update_time' : Tools.g_time(), 'status' : 'ok' })) return self def 
to_pub(self, pub_id, target_servers, pub_process_callback=None, pub_success_callback = None, **ext_data): pub_node_id = self.get_pub_node_id(pub_id) ext_data['pub_id'] = pub_id ext_data['pub_node_id'] = pub_node_id ext_data['update_time'] = Tools.g_time() ext_data['servers'] = target_servers try: if self._zookeeper.exists(self._root_node + '/to_pub_notice/' + pub_node_id) is None: self._zookeeper.create(self._root_node + '/to_pub_notice/' + pub_node_id, json.dumps(ext_data), makepath = True) else: self._zookeeper.set(self._root_node + '/to_pub_notice/' + pub_node_id, json.dumps(ext_data)) if self._zookeeper.exists(self._root_node + '/to_pub_result/' + pub_node_id) is None: self._zookeeper.create(self._root_node + '/to_pub_result/' + pub_node_id, '', makepath = True) else: self._zookeeper.set(self._root_node + '/to_pub_result/' + pub_node_id, '') for target_server_id in target_servers: target_node = self._root_node + '/to_pub_result/' + pub_node_id + '/s' + str(target_server_id) if self._zookeeper.exists(target_node) is not None: self._zookeeper.delete(target_node) except kazoo.exceptions.NodeExistsError: pass if self._to_pub_node.get(pub_node_id, None) is None: self._to_pub_node[pub_node_id] = { 'callback' : [pub_process_callback, pub_success_callback], 'servers' : target_servers, 'notices' : [], 'results' : {}, 'update_time' : Tools.g_time() } self.pub_children_notice(pub_id, pub_node_id) else : self._to_pub_node[pub_node_id]['callback'] = [pub_process_callback, pub_success_callback] self._to_pub_node[pub_node_id]['servers'] = target_servers self._to_pub_node[pub_node_id]['results'] = {} self._to_pub_node[pub_node_id]['time'] = Tools.g_time() return self def pub_children_notice(self, pub_id, pub_node_id): @self._zookeeper.ChildrenWatch('%s/to_pub_result/%s' % (self._root_node, pub_node_id, )) def to_pub_process(server_list): for server_node in server_list: if server_node not in self._to_pub_node[pub_node_id]['notices']: self._to_pub_node[pub_node_id]['notices'].append(server_node) self.pub_process_notice(pub_id, pub_node_id, server_node) return self def pub_process_notice(self, pub_id, pub_node_id, server_node): pub_server_node = '%s/to_pub_result/%s/%s' % (self._root_node, pub_node_id, server_node, ) @self._zookeeper.DataWatch(pub_server_node) def to_pub_process(data, stat, event): if event is not None and event.type == 'DELETED': return if 0 == len(data): return LOG.info('pub children %s %s' % (pub_server_node, data, )) pub_detail = json.loads(data) if isinstance(pub_detail, dict) == False or \ pub_detail.get('update_time', None) is None or \ pub_detail.get('status', None) is None: return if pub_detail['status'] == 'ok': self._to_pub_node[pub_node_id]['results'][server_node] = True else: self._to_pub_node[pub_node_id]['results'][server_node] = False self._to_pub_node[pub_node_id]['callback'][0](server_node, data) all_pub_finished = True if len(self._to_pub_node[pub_node_id]['servers']) > 0 else False for server_id in self._to_pub_node[pub_node_id]['servers']: target_server_node = 's%s' % server_id if self._to_pub_node[pub_node_id]['results'].get(target_server_node, False) is False: all_pub_finished = False break if all_pub_finished: self._to_pub_node[pub_node_id]['callback'][1]() self._to_pub_node[pub_node_id]['callback'] = [] self._to_pub_node[pub_node_id]['results'] = {} self._zookeeper.set('%s/to_pub_notice/%s' % (self._root_node, pub_node_id, ), json.dumps({ 'pub_id' : pub_id, 'pub_node_id' : pub_node_id, 'update_time' : self._to_pub_node[pub_node_id]['update_time'], 'servers' : 
self._to_pub_node[pub_node_id]['servers'], 'finish_time' : Tools.g_time(), 'status' : 'ok' })) self._zookeeper.set('%s/to_pub_result/%s' % (self._root_node, pub_node_id, ), json.dumps({ 'update_time' : Tools.g_time(), 'status' : 'ok' })) return self def deprecated(self, pub_id): pub_node_id = self.get_pub_node_id(pub_id) if self._zookeeper.exists(self._root_node + '/to_syc_notice/' + pub_node_id): self._zookeeper.set('%s/to_syc_notice/%s' % (self._root_node, pub_node_id, ), json.dumps({ 'pub_id' : pub_id, 'pub_node_id' : pub_node_id, 'update_time' : Tools.g_time(), 'servers' : [], 'finish_time' : Tools.g_time(), 'status' : 'deprecated' })) if self._zookeeper.exists(self._root_node + '/to_pub_notice/' + pub_node_id): self._zookeeper.set('%s/to_pub_notice/%s' % (self._root_node, pub_node_id, ), json.dumps({ 'pub_id' : pub_id, 'pub_node_id' : pub_node_id, 'update_time' : Tools.g_time(), 'servers' : [], 'finish_time' : Tools.g_time(), 'status' : 'deprecated' }))
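A hedged usage sketch of the workflow above: request a zip, then fan a sync out to two servers and wait for the per-server result watches to fire. The callback names, pub_id, server IDs and the extra config_version/game_version fields are placeholders (the companion mock_syc worker shown further below reads those two fields).

# Illustrative only; assumes the /jzqps tree exists and worker processes handle the notices.
publisher = Publish(host='127.0.0.1', port=2181, root_node='/jzqps')

def on_zip_done(data):
    print('zip finished: %s' % data)

def on_syc_progress(server_node, data):
    print('sync progress on %s: %s' % (server_node, data))

def on_syc_done():
    print('sync finished on all servers')

publisher.to_zip(1001, zip_callback=on_zip_done,
                 config_version='1.0.0', game_version='2.0.0')
publisher.to_syc(1001, [1, 2],
                 syc_process_callback=on_syc_progress,
                 syc_success_callback=on_syc_done,
                 config_version='1.0.0', game_version='2.0.0')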
class ZooKeeper(object): # Constants used by the REST API: LIVE_NODES_ZKNODE = '/live_nodes' ALIASES = '/aliases.json' CLUSTER_STATE = '/clusterstate.json' SHARDS = 'shards' REPLICAS = 'replicas' STATE = 'state' ACTIVE = 'active' LEADER = 'leader' BASE_URL = 'base_url' TRUE = 'true' FALSE = 'false' COLLECTION = 'collection' COLLECTION_STATE = '/collections/{}/state.json' NODE_NAME = 'node_name' def __init__(self, zkServerAddress, zkClientTimeout=15, zkClientConnectTimeout=15): if KazooClient is None: logging.error( 'ZooKeeper requires the `kazoo` library to be installed') raise RuntimeError self.watchedCollections = [] self.collections = {} self.liveNodes = {} self.aliases = {} self.state = None self.hasClusterState = False self.zk = KazooClient(zkServerAddress, read_only=True) self.zk.start() random.seed() def connectionListener(state): if state == KazooState.LOST: self.state = state elif state == KazooState.SUSPENDED: self.state = state self.zk.add_listener(connectionListener) @self.zk.DataWatch(ZooKeeper.CLUSTER_STATE) def watchClusterState(data, *args, **kwargs): if not data: LOG.warning( "No cluster state available: no collections defined?") else: self.collections = json.loads(data.decode('utf-8')) self.hasClusterState = True LOG.info('Updated collections: %s', self.collections) @self.zk.ChildrenWatch(ZooKeeper.LIVE_NODES_ZKNODE) def watchLiveNodes(children): self.liveNodes = children LOG.info("Updated live nodes: %s", children) @self.zk.DataWatch(ZooKeeper.ALIASES) def watchAliases(data, stat): if data: json_data = json.loads(data.decode('utf-8')) if ZooKeeper.COLLECTION in json_data and json_data[ ZooKeeper.COLLECTION]: self.aliases = json_data[ZooKeeper.COLLECTION] else: LOG.warning('Expected to find %s in alias update %s', ZooKeeper.COLLECTION, json_data.keys()) else: self.aliases = {} LOG.info("Updated aliases: %s", self.aliases) def watchCollection(self, collection): path = ZooKeeper.COLLECTION_STATE.format(collection) def watch(event=None): data = self.zk.get(path, watch=watch) self.collections[collection] = json.loads( data[0].decode("utf8"))[collection] try: watch() except NoNodeError as e: if (self.hasClusterState and collection not in self.collections) or not self.hasClusterState: raise SolrError("No collection %s" % collection) def __del__(self): # Avoid leaking connection handles in Kazoo's atexit handler: self.zk.stop() self.zk.close() def getHosts(self, collname, only_leader=False, seen_aliases=None): if self.aliases and collname in self.aliases: return self.getAliasHosts(collname, only_leader, seen_aliases) hosts = [] if collname not in self.collections: raise SolrError("Unknown collection: %s", collname) collection = self.collections[collname] shards = collection[ZooKeeper.SHARDS] for shardname in shards.keys(): shard = shards[shardname] if shard[ZooKeeper.STATE] == ZooKeeper.ACTIVE: replicas = shard[ZooKeeper.REPLICAS] for replicaname in replicas.keys(): replica = replicas[replicaname] if replica[ZooKeeper.STATE] == ZooKeeper.ACTIVE: if not only_leader or (replica.get( ZooKeeper.LEADER, None) == ZooKeeper.TRUE): base_url = replica[ZooKeeper.BASE_URL] if base_url not in hosts: if replica[ ZooKeeper.NODE_NAME] in self.liveNodes: hosts.append(base_url) return hosts def getAliasHosts(self, collname, only_leader, seen_aliases): if seen_aliases: if collname in seen_aliases: LOG.warn("%s in circular alias definition - ignored", collname) return [] else: seen_aliases = [] seen_aliases.append(collname) collections = self.aliases[collname].split(",") hosts = [] for collection 
in collections: for host in self.getHosts(collection, only_leader, seen_aliases): if host not in hosts: hosts.append(host) return hosts def getRandomURL(self, collname): hosts = self.getHosts(collname, only_leader=False) if len(hosts) == 0: raise SolrError("No hosts available for %s" % collname) return random.choice(hosts) + "/" + collname def getLeaderURL(self, collname): hosts = self.getHosts(collname, only_leader=True) if len(hosts) == 0: raise SolrError("No leaders available for %s" % collname) return random.choice(hosts) + "/" + collname
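A usage sketch for the SolrCloud helper above, assuming a local ZooKeeper ensemble and a collection named collection1 (both placeholders):

# Illustrative only: resolve request URLs for a collection from SolrCloud cluster state.
zk = ZooKeeper("localhost:2181")            # watches cluster state, live nodes and aliases
zk.watchCollection("collection1")           # needed when the collection uses per-collection state.json
any_url = zk.getRandomURL("collection1")    # e.g. "http://host:8983/solr/collection1"
leader_url = zk.getLeaderURL("collection1") # restricted to shard leaders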
def get_fake_zk(nodename, timeout=30.0): _fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout) _fake_zk_instance.start() return _fake_zk_instance
class mock_syc(object): _root_node = '' _server_list = {} _zookeeper = None _shell_path = '' def __init__(self, host='127.0.0.1', port=2181, root_node='/test', shell_path='./'): self._zookeeper = KazooClient('%s:%s' % ( host, port, )) self._root_node = root_node self._shell_path = shell_path def run(self): self._zookeeper.start() self.init() def init(self): syc_node = '%s/to_syc_notice' % self._root_node default_node_value = json.dumps({'update_time': time.time()}) try: if self._zookeeper.exists(syc_node) is None: self._zookeeper.create(syc_node, default_node_value, makepath=True) except kazoo.exceptions.NodeExistsError: pass @self._zookeeper.ChildrenWatch('%s/server_list' % (self._root_node, )) def server(server_list): for server_node in server_list: result = self.init_server(server_node) LOG.info('refresh server list %s' % json.dumps(result)) @self._zookeeper.ChildrenWatch('%s/to_syc_notice' % (self._root_node, ) ) def to_syc_node(syc_node_list): for syc_node_id in syc_node_list: LOG.info('watch_syc children %s/to_syc_notice/%s' % ( self._root_node, syc_node_id, )) self.to_syc(syc_node_id) return self def init_server(self, server_node): server_detail = self._zookeeper.get('%s/server_list/%s' % ( self._root_node, server_node, )) if 0 == len(server_detail[0]): self._server_list[server_node] = { 'server_id': 0, 'server_name': '', 'update_time': 0 } else: self._server_list[server_node] = json.loads(server_detail[0]) return self._server_list[server_node] def to_syc(self, syc_node_id): @self._zookeeper.DataWatch('%s/to_syc_notice/%s' % ( self._root_node, syc_node_id, )) def to_zip_execute(data, stat, event): if event is not None and event.type == 'DELETED': return if 0 == len(data): return LOG.info('watch_syc execute %s/to_syc_notice/%s %s' % ( self._root_node, syc_node_id, data, )) node_detail = json.loads(data) if node_detail.get('status', None) == 'ok' or \ node_detail.get('status', None) == 'failed' or \ node_detail.get('servers', None) is None: return all_syc_finished = True for server_index in self._server_list: if 0 == self._server_list[server_index]['server_id'] or \ str(self._server_list[server_index]['server_id']) not in node_detail['servers']: continue node_value = {'update_time': time.time()} if self.syc_execute( node_detail['config_version'], node_detail['game_version'], self._server_list[server_index]['server_id']) is True: LOG.info('syc node %s/to_syc_result/%s/s%s syc success' % (self._root_node, syc_node_id, self._server_list[server_index]['server_id'])) node_value['status'] = 'ok' else: LOG.info('syc node %s/to_syc_result/%s/s%s syc failed' % (self._root_node, syc_node_id, self._server_list[server_index]['server_id'])) node_value['status'] = 'failed' all_syc_finished = False syc_server_node = '%s/to_syc_result/%s/s%s' % ( self._root_node, syc_node_id, self._server_list[server_index]['server_id'], ) try: if self._zookeeper.exists(syc_server_node) is None: self._zookeeper.create(syc_server_node, json.dumps(node_value), makepath=True) else: self._zookeeper.set(syc_server_node, json.dumps(node_value)) except kazoo.exceptions.NodeExistsError: pass if all_syc_finished: node_detail['status'] = 'ok' node_detail['finish_time'] = time.time() self._zookeeper.set( '%s/to_syc_notice/%s' % ( self._root_node, syc_node_id, ), json.dumps(node_detail)) def syc_execute(self, config_version, game_version, server_id): ''' to execute shell to zip resource ''' LOG.info('start to execute shell %s/syc.sh %s %s %s' % ( self._shell_path, config_version, game_version, server_id, )) result = 
subprocess.call('%s/syc.sh %s %s %s > /dev/null 2>&1' % ( self._shell_path, config_version, game_version, server_id, ), shell=True) return True if result == 0 else False
class USSMetadataManager(object): """Interfaces with the locking system to get, put, and delete USS metadata. Metadata gets/stores/deletes the USS information for a partiular grid, including current version number, a list of USSs with active operations, and the endpoints to get that information. Locking is assured through a snapshot token received when getting, and used when putting. """ def __init__(self, connectionstring=DEFAULT_CONNECTION, testgroupid=None): """Initializes the class. Args: connectionstring: Zookeeper connection string - server:port,server:port,... testgroupid: ID to use if in test mode, none for normal mode """ if testgroupid: self.set_testmode(testgroupid) if not connectionstring: connectionstring = DEFAULT_CONNECTION log.debug( 'Creating metadata manager object and connecting to zookeeper...') try: if set(BAD_CHARACTER_CHECK) & set(connectionstring): raise ValueError self.zk = KazooClient(hosts=connectionstring, timeout=CONNECTION_TIMEOUT) self.zk.add_listener(self.zookeeper_connection_listener) self.zk.start() if testgroupid: self.delete_testdata(testgroupid) except KazooTimeoutError: log.error( 'Unable to connect to zookeeper using %s connection string...', connectionstring) raise except ValueError: log.error('Connection string %s seems invalid...', connectionstring) raise def __del__(self): log.debug( 'Destroying metadata manager object and disconnecting from zk...') self.zk.stop() def set_verbose(self): log.setLevel(logging.DEBUG) def set_testmode(self, testgroupid='UNDEFINED_TESTER'): """Sets the mode to testing with the specific test ID, cannot be undone. Args: testgroupid: ID to use if in test mode, none for normal mode """ global GRID_PATH global CONNECTION_TIMEOUT # Adjust parameters specifically for the test GRID_PATH = TEST_BASE_PREFIX + testgroupid + USS_BASE_PREFIX log.debug('Setting test path to %s...', GRID_PATH) CONNECTION_TIMEOUT = 1.0 def zookeeper_connection_listener(self, state): if state == KazooState.LOST: # Register somewhere that the session was lost log.error('Lost connection with the zookeeper servers...') elif state == KazooState.SUSPENDED: # Handle being disconnected from Zookeeper log.error('Suspended connection with the zookeeper servers...') elif state == KazooState.CONNECTED: # Handle being connected/reconnected to Zookeeper log.info('Connection restored with the zookeeper servers...') def delete_testdata(self, testgroupid=None): """Removes the test data from the servers. Be careful when using this in parallel as it removes everything under the testgroupid, or everything if no tetgroupid is provided. Args: testgroupid: ID to use if in test mode, none will remove all test data """ if testgroupid: path = TEST_BASE_PREFIX + testgroupid else: path = TEST_BASE_PREFIX self.zk.delete(path, recursive=True) def get(self, z, x, y): """Gets the metadata and snapshot token for a GridCell. Reads data from zookeeper, including a snapshot token. The snapshot token is used as a reference when writing to ensure the data has not been updated between read and write. Args: z: zoom level in slippy tile format x: x tile number in slippy tile format y: y tile number in slippy tile format Returns: JSend formatted response (https://labs.omniti.com/labs/jsend) """ # TODO(hikevin): Change to use our own error codes and let the server # convert them to http error codes. For now, this is # at least in a standard JSend format. 
status = 500 if self._validate_slippy(z, x, y): (content, metadata) = self._get_raw(z, x, y) if metadata: try: m = uss_metadata.USSMetadata(content) status = 200 result = { 'status': 'success', 'sync_token': metadata.last_modified_transaction_id, 'data': m.to_json() } except ValueError: status = 424 else: status = 404 else: status = 400 if status != 200: result = self._format_status_code_to_jsend(status) return result def set(self, z, x, y, sync_token, uss_id, ws_scope, operation_format, operation_ws, earliest_operation, latest_operation): """Sets the metadata for a GridCell. Writes data, using the snapshot token for confirming data has not been updated since it was last read. Args: z: zoom level in slippy tile format x: x tile number in slippy tile format y: y tile number in slippy tile format sync_token: token retrieved in the original GET GridCellMetadata, uss_id: plain text identifier for the USS, ws_scope: scope to use to obtain OAuth token, operation_format: output format for operation ws (i.e. NASA, GUTMA), operation_ws: submitting USS endpoint where all flights in this cell can be retrieved from, earliest_operation: lower bound of active or planned flight timestamp, used for quick filtering conflicts. latest_operation: upper bound of active or planned flight timestamp, used for quick filtering conflicts. Returns: JSend formatted response (https://labs.omniti.com/labs/jsend) """ status = 500 if self._validate_slippy(z, x, y): # first we have to get the cell status = 0 (content, metadata) = self._get_raw(z, x, y) if metadata: # Quick check of the token, another is done on the actual set to be sure # but this check fails early and fast if str(metadata.last_modified_transaction_id) == str( sync_token): try: m = uss_metadata.USSMetadata(content) log.debug('Setting metadata for %s...', uss_id) if not m.upsert_operator( uss_id, ws_scope, operation_format, operation_ws, earliest_operation, latest_operation): log.error( 'Failed setting operator for %s with token %s...', uss_id, str(sync_token)) raise ValueError status = self._set_raw(z, x, y, m, uss_id, sync_token) except ValueError: status = 424 else: status = 409 else: status = 404 else: status = 400 if status == 200: # Success, now get the metadata back to send back result = self.get(z, x, y) else: result = self._format_status_code_to_jsend(status) return result def delete(self, z, x, y, uss_id): """Sets the metadata for a GridCell by removing the entry for the USS. 
Args: z: zoom level in slippy tile format x: x tile number in slippy tile format y: y tile number in slippy tile format uss_id: is the plain text identifier for the USS Returns: JSend formatted response (https://labs.omniti.com/labs/jsend) """ status = 500 if self._validate_slippy(z, x, y): # first we have to get the cell (content, metadata) = self._get_raw(z, x, y) if metadata: try: m = uss_metadata.USSMetadata(content) m.remove_operator(uss_id) # TODO(pelletierb): Automatically retry on delete status = self._set_raw( z, x, y, m, uss_id, metadata.last_modified_transaction_id) except ValueError: status = 424 else: status = 404 else: status = 400 if status == 200: # Success, now get the metadata back to send back (content, metadata) = self._get_raw(z, x, y) result = { 'status': 'success', 'sync_token': metadata.last_modified_transaction_id, 'data': m.to_json() } else: result = self._format_status_code_to_jsend(status) return result ###################################################################### ################ INTERNAL FUNCTIONS ######################### ###################################################################### def _get_raw(self, z, x, y): """Gets the raw content and metadata for a GridCell from zookeeper. Args: z: zoom level in slippy tile format x: x tile number in slippy tile format y: y tile number in slippy tile format Returns: content: USS metadata metadata: straight from zookeeper """ path = GRID_PATH + '/'.join( (str(z), str(x), str(y))) + USS_METADATA_FILE log.debug('Getting metadata from zookeeper@%s...', path) self.zk.ensure_path(path) c, m = self.zk.get(path) if c: log.debug('Received raw content and metadata from zookeeper: %s', c) if m: log.debug('Received raw metadata from zookeeper: %s', m) return c, m def _set_raw(self, z, x, y, m, uss_id, sync_token): """Grabs the lock and updates the raw content for a GridCell in zookeeper. Args: z: zoom level in slippy tile format x: x tile number in slippy tile format y: y tile number in slippy tile format m: metadata object to write uss_id: the plain text identifier for the USS sync_token: the sync token received during get operation Returns: 200 for success, 409 for conflict, 408 for unable to get the lock """ status = 500 path = GRID_PATH + '/'.join( (str(z), str(x), str(y))) + USS_METADATA_FILE # TODO(hikevin): Remove Lock and use built in set with version lock = self.zk.WriteLock(path, uss_id) try: log.debug('Getting metadata lock from zookeeper@%s...', path) lock.acquire(timeout=LOCK_TIMEOUT) (content, metadata) = self._get_raw(z, x, y) del content if str(metadata.last_modified_transaction_id) == str(sync_token): log.debug('Setting metadata to %s...', str(m)) self.zk.set(path, json.dumps(m.to_json())) status = 200 else: log.error( 'Sync token from USS (%s) does not match token from zk (%s)...', str(sync_token), str(metadata.last_modified_transaction_id)) status = 409 log.debug('Releasing the lock...') lock.release() except LockTimeout: log.error('Unable to acquire the lock for %s...', path) status = 408 return status def _format_status_code_to_jsend(self, status): """Formats a response based on HTTP status code. Args: status: HTTP status code Returns: JSend formatted response (https://labs.omniti.com/labs/jsend) """ if status == 200 or status == 204: result = { 'status': 'success', 'code': 204, 'message': 'Empty data set.' } elif status == 400: result = { 'status': 'fail', 'code': status, 'message': 'Parameters are not following the correct format.' 
} elif status == 404: result = { 'status': 'fail', 'code': status, 'message': 'Unable to pull metadata from lock system.' } elif status == 408: result = { 'status': 'fail', 'code': status, 'message': 'Timeout trying to get lock.' } elif status == 409: result = { 'status': 'fail', 'code': status, 'message': 'Content in metadata has been updated since provided sync token.' } elif status == 424: result = { 'status': 'fail', 'code': status, 'message': 'Content in metadata is not following JSON format guidelines.' } else: result = { 'status': 'fail', 'code': status, 'message': 'Unknown error code occurred.' } return result def _validate_slippy(self, z, x, y): """Validates slippy tile ranges. https://en.wikipedia.org/wiki/Tiled_web_map https://wiki.openstreetmap.org/wiki/Slippy_map_tilenames Args: z: zoom level in slippy tile format x: x tile number in slippy tile format y: y tile number in slippy tile format Returns: true if valid, false if not """ try: z = int(z) x = int(x) y = int(y) if not 0 <= z <= 20: raise ValueError if not 0 <= x < 2**z: raise ValueError if not 0 <= y < 2**z: raise ValueError return True except (ValueError, TypeError): log.error('Invalid slippy format for tiles %sz, %s,%s!', z, x, y) return False
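The _set_raw method above takes a WriteLock and then re-checks the sync token by hand; its own TODO suggests replacing the lock with ZooKeeper's built-in versioned set. A minimal sketch of that alternative, using the znode version rather than the transaction id the class currently compares (this is an assumption about the intended change, not the class's actual implementation); path and new_blob are placeholders.

# Hedged sketch: lock-free compare-and-set via a versioned zk.set().
from kazoo.exceptions import BadVersionError

def set_if_unchanged(zk, path, new_blob, expected_version):
    """Write new_blob only if the znode is still at expected_version; 409 on conflict."""
    try:
        zk.set(path, new_blob, version=expected_version)
        return 200
    except BadVersionError:
        return 409

# Typical flow: read, decide, then write against the version you read.
# data, stat = zk.get(path)
# status = set_if_unchanged(zk, path, json.dumps(new_metadata).encode('utf-8'), stat.version)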
#!/bin/blue-python3.8
# This script runs as a cron job many times.
# It triggers the puppet SSL certificate regeneration, waits for certificate revocation
# on puppet server side and removes the local certificate.
#
# The second script (puppetserver-ensemble.py) runs on puppet servers
# and removes the node certificate from Puppet by request in Zookeeper.
import os
import sys
import re
import socket
import argparse
import logging
from subprocess import check_output, check_call
from kazoo.client import KazooClient

ZK_PATH_BASE = "/puppetserver/ensemble"
ZK_PATH_REMOVE_REQUESTS = f"{ZK_PATH_BASE}/remove_requests"
ZK_PATH_REMOVE_REQUESTS_PROCESSED = f"{ZK_PATH_BASE}/remove_requests_processed"
HOSTNAME = socket.gethostname()
CERT_PATH = f"/etc/puppetlabs/puppet/ssl/certs/{HOSTNAME}.pem"
logger = logging.getLogger()

def read_bookings_env_var(var):
    with open("/etc/sysconfig/bookings.puppet") as f:
        lines = f.readlines()
        for line in lines:
            if line.startswith(f"{var}="):
                return line[len(f"{var}="):]
    return None

def get_certificate_alt_names(certificate_path):
    # The function returns the list of alternative names in puppet certificate
    # /bin/openssl x509 -in /etc/puppetlabs/puppet/ssl/certs/$(uname -n).pem -text | grep -A 1 'Subject Alternative Name'
    command = f"/bin/openssl x509 -in {certificate_path} -text".split()
    certificate_output = check_output(command, universal_newlines=True)
    if not certificate_output:
        logger.info(f"Cannot get the certificate details from {certificate_path}.")
        return []
    lines_with_dns = [line for line in certificate_output.split('\n') if 'DNS:' in line]
    if not lines_with_dns:
        return []
    alt_names = re.findall(r"[\w.-]+\.prod\.booking\.com", lines_with_dns[0])
    if HOSTNAME in alt_names:
        alt_names.remove(HOSTNAME)
    return alt_names

def remove_file(file):
    if os.path.exists(file):
        os.remove(file)
        logging.debug(f"Removed file {file}")

def main():
    parser = argparse.ArgumentParser(description="Regenerate puppet certificate")
    parser.add_argument(
        "--debug", action="store_const", const=True, default=False,
        help="More verbose logging",
    )
    args = parser.parse_args()
    level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(
        level=level,
        format="[{levelname:7s}| {asctime} | {module:10s}] {message}",
        style="{",
        stream=sys.stdout,
    )
    if not os.path.exists(CERT_PATH):
        logging.info(f"Certificate does not exist at {CERT_PATH}. Exiting.")
        return
    zk = KazooClient(hosts=read_bookings_env_var("ZOOKEEPER_CLUSTER"))
    zk.start()
    alt_names = get_certificate_alt_names(CERT_PATH)
    if alt_names:
        logging.info(f"Certificate {CERT_PATH} has alt names {alt_names}. Restarting nginx service.")
        # Cleanup request processing
        if zk.exists(f"{ZK_PATH_REMOVE_REQUESTS_PROCESSED}/{HOSTNAME}"):
            zk.delete(f"{ZK_PATH_REMOVE_REQUESTS_PROCESSED}/{HOSTNAME}")
        check_call("/bin/systemctl restart nginx".split())
        return
    if zk.exists(f"{ZK_PATH_REMOVE_REQUESTS_PROCESSED}/{HOSTNAME}"):
        logging.info(f"Request to remove certificate is processed: {ZK_PATH_REMOVE_REQUESTS_PROCESSED}/{HOSTNAME} exists. Clean up the local certificate.")
        # cleanup existing certificate
        remove_file(f"/etc/puppetlabs/puppet/ssl/certs/{HOSTNAME}.pem")
        remove_file(f"/etc/puppetlabs/puppet/ssl/private_keys/{HOSTNAME}.pem")
        remove_file(f"/etc/puppetlabs/puppet/ssl/public_keys/{HOSTNAME}.pem")
        logging.info("Restarting puppet daemon")
        check_call("/bin/systemctl restart puppet".split())
    else:
        # if the request to clean the certificate on the puppet server side is not processed yet, make a cleanup request
        logging.info("Request the certificate cleanup.")
        zk.ensure_path(f"{ZK_PATH_REMOVE_REQUESTS}/{HOSTNAME}")

if __name__ == '__main__':
    main()
def get_query_server_config(name='beeswax', connector=None): if connector and has_connectors(): # TODO: Give empty connector when no connector in use query_server = get_query_server_config_via_connector(connector) else: LOG.debug("Query cluster %s" % name) if name == "llap": activeEndpoint = cache.get('llap') if activeEndpoint is None: if HIVE_DISCOVERY_LLAP.get(): LOG.debug("Checking zookeeper for Hive Server Interactive endpoint") zk = KazooClient(hosts=libzookeeper_conf.ENSEMBLE.get(), read_only=True) zk.start() if HIVE_DISCOVERY_LLAP_HA.get(): znode = "{0}/instances".format(HIVE_DISCOVERY_LLAP_ZNODE.get()) LOG.debug("Setting up LLAP with the following node {0}".format(znode)) if zk.exists(znode): hiveservers = zk.get_children(znode) for server in hiveservers: llap_servers= json.loads(zk.get("{0}/{1}".format(znode, server))[0])["internal"][0] if llap_servers["api"] == "activeEndpoint": cache.set("llap", json.dumps({"host": llap_servers["addresses"][0]["host"], "port": llap_servers["addresses"][0]["port"]}), CACHE_TIMEOUT.get()) else: LOG.error("LLAP Endpoint not found, reverting to HiveServer2") cache.set("llap", json.dumps({"host": HIVE_SERVER_HOST.get(), "port": HIVE_HTTP_THRIFT_PORT.get()}), CACHE_TIMEOUT.get()) else: znode = "{0}".format(HIVE_DISCOVERY_LLAP_ZNODE.get()) LOG.debug("Setting up LLAP with the following node {0}".format(znode)) if zk.exists(znode): hiveservers = zk.get_children(znode) for server in hiveservers: cache.set("llap", json.dumps({"host": server.split(';')[0].split('=')[1].split(":")[0], "port": server.split(';')[0].split('=')[1].split(":")[1]})) zk.stop() else: LOG.debug("Zookeeper Discovery not enabled, reverting to config values") cache.set("llap", json.dumps({"host": LLAP_SERVER_HOST.get(), "port": LLAP_SERVER_THRIFT_PORT.get()}), CACHE_TIMEOUT.get()) activeEndpoint = json.loads(cache.get("llap")) elif name != 'hms' and name != 'impala': activeEndpoint = cache.get("hiveserver2") if activeEndpoint is None: if HIVE_DISCOVERY_HS2.get(): zk = KazooClient(hosts=libzookeeper_conf.ENSEMBLE.get(), read_only=True) zk.start() znode = HIVE_DISCOVERY_HIVESERVER2_ZNODE.get() LOG.info("Setting up Hive with the following node {0}".format(znode)) if zk.exists(znode): hiveservers = zk.get_children(znode) server_to_use = 0 # if CONF.HIVE_SPREAD.get() randint(0, len(hiveservers)-1) else 0 cache.set("hiveserver2", json.dumps({"host": hiveservers[server_to_use].split(";")[0].split("=")[1].split(":")[0], "port": hiveservers[server_to_use].split(";")[0].split("=")[1].split(":")[1]})) else: cache.set("hiveserver2", json.dumps({"host": HIVE_SERVER_HOST.get(), "port": HIVE_HTTP_THRIFT_PORT.get()})) zk.stop() else: cache.set("hiveserver2", json.dumps({"host": HIVE_SERVER_HOST.get(), "port": HIVE_HTTP_THRIFT_PORT.get()})) activeEndpoint = json.loads(cache.get("hiveserver2")) if name == 'impala': from impala.dbms import get_query_server_config as impala_query_server_config query_server = impala_query_server_config() elif name == 'hms': kerberos_principal = hive_site.get_hiveserver2_kerberos_principal(HIVE_SERVER_HOST.get()) query_server = { 'server_name': 'hms', 'server_host': HIVE_METASTORE_HOST.get() if not cluster_config else cluster_config.get('server_host'), 'server_port': HIVE_METASTORE_PORT.get(), 'principal': kerberos_principal, 'transport_mode': 'http' if hive_site.hiveserver2_transport_mode() == 'HTTP' else 'socket', 'auth_username': AUTH_USERNAME.get(), 'auth_password': AUTH_PASSWORD.get(), 'use_sasl': HIVE_USE_SASL.get() } else: kerberos_principal = 
hive_site.get_hiveserver2_kerberos_principal(HIVE_SERVER_HOST.get()) query_server = { 'server_name': 'beeswax', 'server_host': activeEndpoint["host"], 'server_port': LLAP_SERVER_PORT.get() if name == 'llap' else HIVE_SERVER_PORT.get(), 'principal': kerberos_principal, 'http_url': '%(protocol)s://%(host)s:%(port)s/%(end_point)s' % { 'protocol': 'https' if hiveserver2_use_ssl() else 'http', 'host': activeEndpoint["host"], 'port': activeEndpoint["port"], 'end_point': hive_site.hiveserver2_thrift_http_path() }, 'transport_mode': 'http' if hive_site.hiveserver2_transport_mode() == 'HTTP' else 'socket', 'auth_username': AUTH_USERNAME.get(), 'auth_password': AUTH_PASSWORD.get(), 'use_sasl': HIVE_USE_SASL.get(), 'close_sessions': CLOSE_SESSIONS.get(), 'has_session_pool': has_session_pool(), 'max_number_of_sessions': MAX_NUMBER_OF_SESSIONS.get() } if name == 'sparksql': # Extends Hive as very similar from spark.conf import SQL_SERVER_HOST as SPARK_SERVER_HOST, SQL_SERVER_PORT as SPARK_SERVER_PORT, USE_SASL as SPARK_USE_SASL query_server.update({ 'server_name': 'sparksql', 'server_host': SPARK_SERVER_HOST.get(), 'server_port': SPARK_SERVER_PORT.get(), 'use_sasl': SPARK_USE_SASL.get() }) if not query_server.get('dialect'): query_server['dialect'] = query_server['server_name'] debug_query_server = query_server.copy() debug_query_server['auth_password_used'] = bool(debug_query_server.pop('auth_password', None)) LOG.debug("Query Server: %s" % debug_query_server) return query_server
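The repeated split(';')/split('=')/split(':') chains above pull the host and port out of HiveServer2 dynamic service discovery child names, which typically look like "serverUri=host:port;version=...;sequence=..." (the exact layout is an assumption here). A small helper that makes the parsing explicit:

# Hedged helper for the split() chains above; the znode name format is assumed.
def parse_hs2_znode_name(child_name):
    server_uri = child_name.split(';')[0].split('=')[1]   # "hs2-host:10001"
    host, port = server_uri.split(':')
    return host, int(port)

# parse_hs2_znode_name("serverUri=hs2-host:10001;version=3.1.0;sequence=0000000012")
# -> ("hs2-host", 10001)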
class ZooAnimal: def __init__(self): self.zk = KazooClient(hosts=ZOOKEEPER_LOCATION) self.zk.start() # Use util function to get IP address self.ipaddress = [ ip for ip in list(local_ip4_addr_list()) if ip.startswith(NETWORK_PREFIX) ][0] # Inheriting children should assign values to fit the scheme # /role/topic self.role = None self.topic = None #Will only be set by pub and sub self.broker = None # Zookeeper #self.election = None self.election = self.zk.Election('/broker', self.ipaddress) self.zk_seq_id = None self.zk_is_a_master = False def zookeeper_watcher(self, watch_path): @self.zk.DataWatch(watch_path) def zookeeper_election(data, stat, event): print("Setting election watch.") print("Watching node -> ", data) if data is None: print("Data is none.") self.election.run(self.zookeeper_register) #self.election.cancel() def zookeeper_master(self): if not self.zk_is_a_master: print("ZOOANIMAL -> Becoming a master.") role_topic = "/broker/master" data = {'ip': self.ipaddress} data_string = json.dumps(data) encoded_ip = codecs.encode(data_string, "utf-8") self.zk.create(role_topic, ephemeral=True, makepath=True, sequence=True, value=encoded_ip) self.zk_is_a_master = True return self.zk_is_a_master def zookeeper_register(self): pass # This is a function stub for the get_broker watch callback # The child is expected to implement their own logic # Pub and Sub need to register_sub() def broker_update(self, data): print("Broker updated.") print("Data -> {}".format(data)) pass def get_broker(self): for i in range(10): if self.zk.exists(PATH_TO_MASTER_BROKER): node_data = self.zk.get(PATH_TO_MASTER_BROKER, watch=self.broker_update) broker_data = node_data[0] master_broker = codecs.decode(broker_data, 'utf-8') if master_broker != '': self.broker = master_broker return self.broker else: raise Exception("No master broker.") time.sleep(0.2)
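The master hand-off above combines kazoo's Election recipe with an ephemeral, sequential master znode. A minimal standalone sketch of the same pattern; the paths, IP address and payload are placeholders.

# Minimal sketch of the election pattern used by ZooAnimal; not its actual implementation.
from kazoo.client import KazooClient

zk = KazooClient(hosts='127.0.0.1:2181')
zk.start()

def become_master():
    # Runs only while this client holds leadership: publish our address ephemerally
    # so it vanishes if the session is lost.
    zk.create('/broker/master', b'{"ip": "10.0.0.5"}',
              ephemeral=True, sequence=True, makepath=True)
    # ... serve until we choose to give up leadership ...

election = zk.Election('/broker', identifier='10.0.0.5')
election.run(become_master)   # blocks; calls become_master() once leadership is won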
class KazooCommandProxy(): def __init__(self, module): self.module = module self.zk = KazooClient(module.params['hosts']) def absent(self): return self._absent(self.module.params['name']) def exists(self, znode): return self.zk.exists(znode) def list(self): children = self.zk.get_children(self.module.params['name']) return True, { 'count': len(children), 'items': children, 'msg': 'Retrieved znodes in path.', 'znode': self.module.params['name'] } def present(self): return self._present(self.module.params['name'], self.module.params['value']) def get(self): return self._get(self.module.params['name']) def shutdown(self): self.zk.stop() self.zk.close() def start(self): self.zk.start() def wait(self): return self._wait(self.module.params['name'], self.module.params['timeout']) def _absent(self, znode): if self.exists(znode): self.zk.delete(znode, recursive=self.module.params['recursive']) return True, {'changed': True, 'msg': 'The znode was deleted.'} else: return True, {'changed': False, 'msg': 'The znode does not exist.'} def _get(self, path): if self.exists(path): value, zstat = self.zk.get(path) stat_dict = {} for i in dir(zstat): if not i.startswith('_'): attr = getattr(zstat, i) if isinstance(attr, (int, str)): stat_dict[i] = attr result = True, { 'msg': 'The node was retrieved.', 'znode': path, 'value': value, 'stat': stat_dict } else: result = False, {'msg': 'The requested node does not exist.'} return result def _present(self, path, value): if self.exists(path): (current_value, zstat) = self.zk.get(path) if value != current_value: self.zk.set(path, to_bytes(value)) return True, { 'changed': True, 'msg': 'Updated the znode value.', 'znode': path, 'value': value } else: return True, { 'changed': False, 'msg': 'No changes were necessary.', 'znode': path, 'value': value } else: self.zk.create(path, to_bytes(value), makepath=True) return True, { 'changed': True, 'msg': 'Created a new znode.', 'znode': path, 'value': value } def _wait(self, path, timeout, interval=5): lim = time.time() + timeout while time.time() < lim: if self.exists(path): return True, { 'msg': 'The node appeared before the configured timeout.', 'znode': path, 'timeout': timeout } else: time.sleep(interval) return False, { 'msg': 'The node did not appear before the operation timed out.', 'timeout': timeout, 'znode': path }
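The proxy expects an Ansible-style module object whose params dict drives each operation. A hedged sketch with a stand-in module object; the znode path and value are placeholders.

# Illustrative only: drive KazooCommandProxy outside Ansible with a fake module object.
class FakeModule(object):
    def __init__(self, **params):
        self.params = params

module = FakeModule(hosts='127.0.0.1:2181', name='/app/config', value='v1',
                    recursive=False, timeout=30)
zkp = KazooCommandProxy(module)
zkp.start()                      # starts the underlying KazooClient
changed, meta = zkp.present()    # creates or updates /app/config with "v1"
ok, info = zkp.get()             # reads the value and stat back
zkp.shutdown()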
def zookeeper_resolve_leader(addresses, path): """ Resolve the leader using a znode path. ZooKeeper imposes a total order on the elements of the queue, guaranteeing that the oldest element of the queue is the first one. We can thus return the first address we get from ZooKeeper. """ hosts = ",".join(addresses) try: zk = KazooClient(hosts=hosts) zk.start() except Exception as exception: raise CLIException( "Unable to initialize Zookeeper Client: {error}".format( error=exception)) try: children = zk.get_children(path) except Exception as exception: raise CLIException( "Unable to get children of {zk_path}: {error}".format( zk_path=path, error=exception)) masters = sorted( # 'json.info' is the prefix for master nodes. child for child in children if child.startswith("json.info")) address = "" for master in masters: try: node_path = "{path}/{node}".format(path=path, node=master) json_data, _ = zk.get(node_path) except Exception as exception: raise CLIException( "Unable to get the value of '{node}': {error}".format( node=node_path, error=exception)) try: data = json.loads(json_data) except Exception as exception: raise CLIException( "Could not load JSON from '{data}': {error}".format( data=data, error=str(exception))) if ("address" in data and "ip" in data["address"] and "port" in data["address"]): address = "{ip}:{port}".format(ip=data["address"]["ip"], port=data["address"]["port"]) break try: zk.stop() except Exception as exception: raise CLIException( "Unable to stop Zookeeper Client: {error}".format(error=exception)) if not address: raise CLIException("Unable to resolve the leading" " master using ZooKeeper") return address
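A usage sketch for the resolver above, assuming a Mesos-style registration path; the ZooKeeper addresses and the znode path are placeholders.

# Illustrative only: resolve the leading master registered under /mesos.
addresses = ["zk1.example.com:2181", "zk2.example.com:2181", "zk3.example.com:2181"]
leader = zookeeper_resolve_leader(addresses, "/mesos")
print("Leading master is at {address}".format(address=leader))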
def read_config_file(self, config_file=None): """ Read configuration file and initialize object. If config file is None, it will use default value :param config_file: path to configuration file :return: """ # Stop storage service self.stop_storage_service() config = configparser.ConfigParser() if config_file is not None: config.read(os.path.realpath(config_file)) # Main configuration self.__id = config.get("OPV", "id", fallback="ID") self.__path = config.get("OPV", "path", fallback="directory_manager_storage") self.__path = os.path.realpath(os.path.expanduser(self.__path)) self.__host = config.get("OPV", "host", fallback=socket.gethostbyname( socket.gethostname())) uid_generator_type = config.get("OPV", "uid_type", fallback="basic").upper() # FTP configuration ftp_host = config.get("FTP", "host", fallback="0.0.0.0") ftp_port = config.getint("FTP", "port", fallback=2121) ftp_logfile = config.get("FTP", "logfile", fallback="opv_directory_manager_ftp.log") # HTTP configuration http_host = config.get("HTTP", "host", fallback="0.0.0.0") http_port = config.getint("HTTP", "port", fallback=5050) http_logfile = config.get("HTTP", "logfile", fallback="opv_directory_manager_http.log") # Id if uid_generator_type in ["ZOOKEEPER", "ZK"]: zk_hosts = config.get("ZOOKEEPER", "hosts", fallback="127.0.0.1:2181") zk_path = config.get("ZOOKEEPER", "path", fallback="/DirectoryManager/increment") print(zk_hosts) zk = KazooClient(zk_hosts) zk.start() self.__uid_generator = ZkIDGenerator(zk, path=zk_path, prefix=self.__id) else: self.__uid_generator = BasicIDGenerator(prefix=self.__id) # Storage self.__storage = LocalStorage(self.__path) # FTP ftp_storage_service = FTP(self.__path, host=self.__host, listen_host=ftp_host, listen_port=ftp_port, logfile=ftp_logfile) # HTTP http_storage_service = HTTP(self.__path, host=self.__host, listen_host=http_host, listen_port=http_port, logfile=http_logfile) # Local local_storage_service = LocalStorageService(self.__path) # Storage service self.__storage_service_manager = StorageServiceManager( "ftp", ftp_storage_service) self.__storage_service_manager.addURI("file", local_storage_service) self.__storage_service_manager.addURI("http", http_storage_service)
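The ZOOKEEPER branch above hands an already-started KazooClient to ZkIDGenerator, whose internals are not shown here. One common way to back such an incrementing ID with ZooKeeper is kazoo's Counter recipe, sketched below with placeholder paths; this is an assumption about the approach, not ZkIDGenerator's actual implementation.

# Hedged sketch: a ZooKeeper-backed incrementing ID using kazoo's Counter recipe.
from kazoo.client import KazooClient

zk = KazooClient(hosts="127.0.0.1:2181")
zk.start()

counter = zk.Counter("/DirectoryManager/increment")

def next_uid(counter, prefix="ID"):
    counter += 1                  # atomic add; the running value lives in the znode
    return "{}-{}".format(prefix, counter.value)

# next_uid(counter) -> e.g. "ID-1", then "ID-2", ...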
class StaggerLock(object): def __init__(self, temp_path, timeout, parent='None', acquire_lock=None, app_state=None): """ :type temp_path: str :type timeout: int :type parent: str :type acquire_lock: zoom.agent.entities.thread_safe_object.ThreadSafeObject or None :type app_state: zoom.agent.entities.thread_safe_object.ThreadSafeObject or None """ self._path = temp_path self._timeout = timeout self._parent = parent self._thread = None self._prev_state = None self._zk = KazooClient(hosts=get_zk_conn_string(), timeout=60.0) self._zk.add_listener(self._zk_listener) self._log = logging.getLogger('sent.{0}.sl'.format(parent)) self._counter = 0 self._acquire_lock = acquire_lock self._app_state = app_state def join(self): if self._thread is not None and self._zk.connected: self._thread.join() self._close() else: return def start(self): """ This method is to implement a staggered startup. A new KazooClient is instantiated b/c of thread-safety issues with the election. """ self._zk.start() self._acquire_lock.set_value(True) self._app_state.set_value(ApplicationState.STAGGERED) self._acquire() def _acquire(self): try: while self._acquire_lock.value: if self._zk.connected: lock = self._zk.Lock(self._path, identifier=platform.node()) if lock.acquire(blocking=True, timeout=5): self._thread = Thread(target=self._sleep_and_unlock, args=(lock,), name=str(self)) self._thread.daemon = True self._thread.start() break else: pass else: self._log.info('No connection to ZK. Will not try to ' 'acquire stagger lock.') except LockTimeout: self._log.debug('Lock timed out. Trying to acquire lock again.') self._acquire() except Exception as e: self._log.error('Unhandled exception: {0}'.format(e)) def _close(self): try: self._thread = None self._zk.stop() self._zk.close() # TypeError happens when stop() is called when already stopping except TypeError: pass except Exception as e: self._log.debug('Unhandled exception: {0}'.format(e)) @catch_exception(ConnectionClosedError) def _sleep_and_unlock(self, lck): self._log.info('Got stagger lock. Sleeping for {0} seconds.' .format(self._timeout)) time.sleep(self._timeout) lck.release() self._log.info('Released stagger lock.') def _close_connection(self): self._close() self._acquire_lock.set_value(False) def _zk_listener(self, state): """ The callback function that runs when the connection state to Zookeeper changes. Either passes or immediately spawns a new thread that resets any watches, etc., so that it can listen to future connection state changes. 
""" try: self._log.info('Zookeeper Connection went from {0} to {1}' .format(self._prev_state, state)) if self._prev_state is None and state == KazooState.CONNECTED: pass elif self._prev_state == KazooState.LOST and state == KazooState.CONNECTED: pass elif self._prev_state == KazooState.CONNECTED and state == KazooState.SUSPENDED: self._zk.handler.spawn(self._close_connection) elif self._prev_state == KazooState.CONNECTED and state == KazooState.LOST: self._zk.handler.spawn(self._close_connection) elif self._prev_state == KazooState.SUSPENDED and state == KazooState.LOST: self._zk.handler.spawn(self._close_connection) elif self._prev_state == KazooState.SUSPENDED and state == KazooState.CONNECTED: pass elif state == KazooState.CONNECTED: self._zk.handler.spawn(self._close_connection) else: self._log.info('Zookeeper Connection in unknown state: {0}' .format(state)) return self._prev_state = state except Exception: self._log.exception('An uncaught exception has occurred') def __repr__(self): return 'StaggerLock(path={0}, timeout={1})'.format(self._path, self._timeout) def __str__(self): return self.__repr__()
class AnalyticsDiscovery(gevent.Greenlet): def _sandesh_connection_info_update(self, status, message): new_conn_state = getattr(ConnectionStatus, status) ConnectionState.update(conn_type = ConnectionType.ZOOKEEPER, name = self._svc_name, status = new_conn_state, message = message, server_addrs = self._zk_server.split(',')) if (self._conn_state and self._conn_state != ConnectionStatus.DOWN and new_conn_state == ConnectionStatus.DOWN): msg = 'Connection to Zookeeper down: %s' %(message) self._logger.error(msg) if (self._conn_state and self._conn_state != new_conn_state and new_conn_state == ConnectionStatus.UP): msg = 'Connection to Zookeeper ESTABLISHED' self._logger.error(msg) self._conn_state = new_conn_state #import pdb; pdb.set_trace() # end _sandesh_connection_info_update def _zk_listen(self, state): self._logger.error("Analytics Discovery listen %s" % str(state)) if state == KazooState.CONNECTED: if self._conn_state != ConnectionStatus.UP: self._sandesh_connection_info_update(status='UP', message='') self._logger.error("Analytics Discovery to publish %s" % str(self._pubinfo)) self._reconnect = True else: self._logger.error("Analytics Discovery already connected") else: self._logger.error("Analytics Discovery NOT connected") if self._conn_state == ConnectionStatus.UP: self._sandesh_connection_info_update(status='DOWN', message='') def _zk_datawatch(self, watcher, child, data, stat, event): self._logger.error(\ "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \ (watcher, child, data, event)) self._wchildren[watcher][child] = data if self._watchers[watcher]: self._watchers[watcher](self._wchildren[watcher]) def _zk_watcher(self, watcher, children): self._logger.error("Analytics Discovery Children %s" % children) self._reconnect = True def __init__(self, logger, zkservers, svc_name, inst, watchers={}, zpostfix=""): gevent.Greenlet.__init__(self) self._svc_name = svc_name self._inst = inst self._zk_server = zkservers # initialize logging and other stuff if logger is None: logging.basicConfig() self._logger = logging else: self._logger = logger self._conn_state = None self._sandesh_connection_info_update(status='INIT', message='') self._zk = KazooClient(hosts=zkservers) self._pubinfo = None self._watchers = watchers self._wchildren = {} self._zpostfix = zpostfix self._basepath = "/analytics-discovery-" + self._zpostfix self._reconnect = None def publish(self, pubinfo): self._pubinfo = pubinfo #import pdb; pdb.set_trace() if self._conn_state == ConnectionStatus.UP: try: self._logger.error("ensure %s" % (self._basepath + "/" + self._svc_name)) self._logger.error("zk state %s (%s)" % (self._zk.state, self._zk.client_state)) self._zk.ensure_path(self._basepath + "/" + self._svc_name) self._logger.error("check for %s/%s/%s" % \ (self._basepath, self._svc_name, self._inst)) if pubinfo is not None: if self._zk.exists("%s/%s/%s" % \ (self._basepath, self._svc_name, self._inst)): self._zk.set("%s/%s/%s" % \ (self._basepath, self._svc_name, self._inst), self._pubinfo) else: self._zk.create("%s/%s/%s" % \ (self._basepath, self._svc_name, self._inst), self._pubinfo, ephemeral=True) else: self._logger.error("cannot publish empty info") except Exception as ex: template = "Exception {0} in AnalyticsDiscovery publish. 
Args:\n{1!r}" messag = template.format(type(ex).__name__, ex.args) self._logger.error("%s : traceback %s for %s info %s" % \ (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo))) self._sandesh_connection_info_update(status='DOWN', message='') self._reconnect = True else: self._logger.error("Analytics Discovery cannot publish while down") def _run(self): while True: try: self._zk.start() break except gevent.event.Timeout as e: # Update connection info self._sandesh_connection_info_update(status='DOWN', message=str(e)) gevent.sleep(1) # Zookeeper is also throwing exception due to delay in master election except Exception as e: # Update connection info self._sandesh_connection_info_update(status='DOWN', message=str(e)) gevent.sleep(1) try: # Update connection info self._sandesh_connection_info_update(status='UP', message='') self._reconnect = False # Done connecting to ZooKeeper self._zk.add_listener(self._zk_listen) for wk in self._watchers.keys(): self._zk.ensure_path(self._basepath + "/" + wk) self._wchildren[wk] = {} self._zk.ChildrenWatch(self._basepath + "/" + wk, partial(self._zk_watcher, wk)) # Trigger the initial publish self._reconnect = True while True: try: gevent.sleep(10) # If a reconnect happens during processing, don't lose it while self._reconnect: self._reconnect = False if self._pubinfo: self.publish(self._pubinfo) for wk in self._watchers.keys(): self._zk.ensure_path(self._basepath + "/" + wk) children = self._zk.get_children(self._basepath + "/" + wk) old_children = set(self._wchildren[wk].keys()) new_children = set(children) # Remove contents for the children who are gone # (DO NOT remove the watch) for elem in old_children - new_children: self._wchildren[wk][elem] = None # Overwrite existing children, or create new ones for elem in new_children: # Create a watch for new children if elem not in self._wchildren[wk]: self._zk.DataWatch(self._basepath + "/" + \ wk + "/" + elem, partial(self._zk_datawatch, wk, elem)) self._wchildren[wk][elem], _ = \ self._zk.get(self._basepath + "/" + wk + "/" + elem) self._logger.error(\ "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \ (wk, elem, self._wchildren[wk][elem], "GET")) if self._watchers[wk]: self._watchers[wk](self._wchildren[wk]) except gevent.GreenletExit: self._logger.error("Exiting AnalyticsDiscovery for %s" % \ self._svc_name) self._zk.stop() break except Exception as ex: template = "Exception {0} in AnalyticsDiscovery reconnect. Args:\n{1!r}" messag = template.format(type(ex).__name__, ex.args) self._logger.error("%s : traceback %s for %s info %s" % \ (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo))) self._reconnect = True except Exception as ex: template = "Exception {0} in AnalyticsDiscovery run. Args:\n{1!r}" messag = template.format(type(ex).__name__, ex.args) self._logger.error("%s : traceback %s for %s info %s" % \ (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo))) raise SystemExit
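The publish() path above registers each instance as an ephemeral znode so its entry disappears when the session dies. A stripped-down sketch of that register-or-update step, assuming zk is an already-started KazooClient; the base path, instance name and payload are placeholders.

# Minimal sketch of the ephemeral publish pattern used by AnalyticsDiscovery.
from kazoo.client import KazooClient

zk = KazooClient(hosts="127.0.0.1:2181")
zk.start()

base = "/analytics-discovery-test/collector"
instance = "0"
payload = b'{"ip-address": "10.0.0.7", "port": 8086}'

zk.ensure_path(base)
node = "%s/%s" % (base, instance)
if zk.exists(node):
    zk.set(node, payload)                       # refresh an existing registration
else:
    zk.create(node, payload, ephemeral=True)    # vanishes when the session is lost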
class ZooKeeper(AbstractDCS): def __init__(self, config): super(ZooKeeper, self).__init__(config) hosts = config.get('hosts', []) if isinstance(hosts, list): hosts = ','.join(hosts) self._client = KazooClient( hosts, handler=PatroniSequentialThreadingHandler(config['retry_timeout']), timeout=config['ttl'], connection_retry=KazooRetry(max_delay=1, max_tries=-1, sleep_func=time.sleep), command_retry=KazooRetry(deadline=config['retry_timeout'], max_delay=1, max_tries=-1, sleep_func=time.sleep)) self._client.add_listener(self.session_listener) self._fetch_cluster = True self._orig_kazoo_connect = self._client._connection._connect self._client._connection._connect = self._kazoo_connect self._client.start() def _kazoo_connect(self, *args): """Kazoo is using Ping's to determine health of connection to zookeeper. If there is no response on Ping after Ping interval (1/2 from read_timeout) it will consider current connection dead and try to connect to another node. Without this "magic" it was taking up to 2/3 from session timeout (ttl) to figure out that connection was dead and we had only small time for reconnect and retry. This method is needed to return different value of read_timeout, which is not calculated from negotiated session timeout but from value of `loop_wait`. And it is 2 sec smaller than loop_wait, because we can spend up to 2 seconds when calling `touch_member()` and `write_leader_optime()` methods, which also may hang...""" ret = self._orig_kazoo_connect(*args) return max(self.loop_wait - 2, 2) * 1000, ret[1] def session_listener(self, state): if state in [KazooState.SUSPENDED, KazooState.LOST]: self.cluster_watcher(None) def cluster_watcher(self, event): self._fetch_cluster = True self.event.set() def reload_config(self, config): self.set_retry_timeout(config['retry_timeout']) loop_wait = config['loop_wait'] loop_wait_changed = self._loop_wait != loop_wait self._loop_wait = loop_wait self._client.handler.set_connect_timeout(loop_wait) # We need to reestablish connection to zookeeper if we want to change # read_timeout (and Ping interval respectively), because read_timeout # is calculated in `_kazoo_connect` method. If we are changing ttl at # the same time, set_ttl method will reestablish connection and return # `!True`, otherwise we will close existing connection and let kazoo # open the new one. if not self.set_ttl(int(config['ttl'] * 1000)) and loop_wait_changed: self._client._connection._socket.close() def set_ttl(self, ttl): """It is not possible to change ttl (session_timeout) in zookeeper without destroying old session and creating the new one. 
This method returns `!True` if session_timeout has been changed (`restart()` has been called).""" if self._client._session_timeout != ttl: self._client._session_timeout = ttl self._client.restart() return True @property def ttl(self): return self._client._session_timeout def set_retry_timeout(self, retry_timeout): retry = self._client.retry if isinstance( self._client.retry, KazooRetry) else self._client._retry retry.deadline = retry_timeout def get_node(self, key, watch=None): try: ret = self._client.get(key, watch) return (ret[0].decode('utf-8'), ret[1]) except NoNodeError: return None @staticmethod def member(name, value, znode): return Member.from_node(znode.version, name, znode.ephemeralOwner, value) def get_children(self, key, watch=None): try: return self._client.get_children(key, watch) except NoNodeError: return [] def load_members(self, sync_standby): members = [] for member in self.get_children(self.members_path, self.cluster_watcher): watch = member == sync_standby and self.cluster_watcher or None data = self.get_node(self.members_path + member, watch) if data is not None: members.append(self.member(member, *data)) return members def _inner_load_cluster(self): self._fetch_cluster = False self.event.clear() nodes = set( self.get_children(self.client_path(''), self.cluster_watcher)) if not nodes: self._fetch_cluster = True # get initialize flag initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None # get global dynamic configuration config = self.get_node( self.config_path, watch=self.cluster_watcher) if self._CONFIG in nodes else None config = config and ClusterConfig.from_node(config[1].version, config[0], config[1].mzxid) # get timeline history history = self.get_node( self.history_path, watch=self.cluster_watcher) if self._HISTORY in nodes else None history = history and TimelineHistory.from_node( history[1].mzxid, history[0]) # get last leader operation last_leader_operation = self._OPTIME in nodes and self._fetch_cluster and self.get_node( self.leader_optime_path) last_leader_operation = last_leader_operation and int( last_leader_operation[0]) or 0 # get synchronization state sync = self.get_node( self.sync_path, watch=self.cluster_watcher) if self._SYNC in nodes else None sync = SyncState.from_node(sync and sync[1].version, sync and sync[0]) # get list of members sync_standby = sync.leader == self._name and sync.sync_standby or None members = self.load_members( sync_standby) if self._MEMBERS[:-1] in nodes else [] # get leader leader = self.get_node( self.leader_path) if self._LEADER in nodes else None if leader: client_id = self._client.client_id if not self._ctl and leader[0] == self._name and client_id is not None \ and client_id[0] != leader[1].ephemeralOwner: logger.info( 'I am leader but not owner of the session. 
Removing leader node' ) self._client.delete(self.leader_path) leader = None if leader: member = Member(-1, leader[0], None, {}) member = ([m for m in members if m.name == leader[0]] or [member])[0] leader = Leader(leader[1].version, leader[1].ephemeralOwner, member) self._fetch_cluster = member.index == -1 # failover key failover = self.get_node( self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None failover = failover and Failover.from_node(failover[1].version, failover[0]) return Cluster(initialize, config, leader, last_leader_operation, members, failover, sync, history) def _load_cluster(self): cluster = self.cluster if self._fetch_cluster or cluster is None: try: cluster = self._client.retry(self._inner_load_cluster) except Exception: logger.exception('get_cluster') self.cluster_watcher(None) raise ZooKeeperError('ZooKeeper in not responding properly') return cluster def _create(self, path, value, retry=False, ephemeral=False): try: if retry: self._client.retry(self._client.create, path, value, makepath=True, ephemeral=ephemeral) else: self._client.create_async(path, value, makepath=True, ephemeral=ephemeral).get(timeout=1) return True except Exception: logger.exception('Failed to create %s', path) return False def attempt_to_acquire_leader(self, permanent=False): ret = self._create(self.leader_path, self._name.encode('utf-8'), retry=True, ephemeral=not permanent) if not ret: logger.info('Could not take out TTL lock') return ret def _set_or_create(self, key, value, index=None, retry=False, do_not_create_empty=False): value = value.encode('utf-8') try: if retry: self._client.retry(self._client.set, key, value, version=index or -1) else: self._client.set_async(key, value, version=index or -1).get(timeout=1) return True except NoNodeError: if do_not_create_empty and not value: return True elif index is None: return self._create(key, value, retry) else: return False except Exception: logger.exception('Failed to update %s', key) return False def set_failover_value(self, value, index=None): return self._set_or_create(self.failover_path, value, index) def set_config_value(self, value, index=None): return self._set_or_create(self.config_path, value, index, retry=True) def initialize(self, create_new=True, sysid=""): sysid = sysid.encode('utf-8') return self._create(self.initialize_path, sysid, retry=True) if create_new \ else self._client.retry(self._client.set, self.initialize_path, sysid) def touch_member(self, data, permanent=False): cluster = self.cluster member = cluster and cluster.get_member(self._name, fallback_to_leader=False) encoded_data = json.dumps(data, separators=(',', ':')).encode('utf-8') if member and (self._client.client_id is not None and member.session != self._client.client_id[0] or not (deep_compare(member.data.get('tags', {}), data.get('tags', {})) and member.data.get('version') == data.get('version') and member.data.get('checkpoint_after_promote') == data.get('checkpoint_after_promote'))): try: self._client.delete_async(self.member_path).get(timeout=1) except NoNodeError: pass except Exception: return False member = None if member: if deep_compare(data, member.data): return True else: try: self._client.create_async( self.member_path, encoded_data, makepath=True, ephemeral=not permanent).get(timeout=1) return True except Exception as e: if not isinstance(e, NodeExistsError): logger.exception('touch_member') return False try: self._client.set_async(self.member_path, encoded_data).get(timeout=1) return True except Exception: 
logger.exception('touch_member') return False def take_leader(self): return self.attempt_to_acquire_leader() def _write_leader_optime(self, last_operation): return self._set_or_create(self.leader_optime_path, last_operation) def _update_leader(self): return True def delete_leader(self): self._client.restart() return True def _cancel_initialization(self): node = self.get_node(self.initialize_path) if node: self._client.delete(self.initialize_path, version=node[1].version) def cancel_initialization(self): try: self._client.retry(self._cancel_initialization) except Exception: logger.exception("Unable to delete initialize key") def delete_cluster(self): try: return self._client.retry(self._client.delete, self.client_path(''), recursive=True) except NoNodeError: return True def set_history_value(self, value): return self._set_or_create(self.history_path, value) def set_sync_state_value(self, value, index=None): return self._set_or_create(self.sync_path, value, index, retry=True, do_not_create_empty=True) def delete_sync_state(self, index=None): return self.set_sync_state_value("{}", index) def watch(self, leader_index, timeout): if super(ZooKeeper, self).watch(leader_index, timeout): self._fetch_cluster = True return self._fetch_cluster
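# The _set_or_create() helper above follows a common kazoo idiom: attempt a
# (possibly versioned) set first and fall back to creating the node when it does
# not exist yet. A standalone sketch of that idiom with plain kazoo; the endpoint,
# path and value are illustrative assumptions.
from kazoo.client import KazooClient
from kazoo.exceptions import BadVersionError, NoNodeError


def set_or_create(client, path, value, version=-1):
    """Set `path` to `value`; create it (with parents) if it is missing."""
    try:
        client.set(path, value, version=version)
        return True
    except BadVersionError:
        # Someone else modified the node since we read it; let the caller retry.
        return False
    except NoNodeError:
        if version != -1:
            # A specific version was expected, so a missing node is a failure.
            return False
        client.create(path, value, makepath=True)
        return True


zk = KazooClient(hosts="127.0.0.1:2181")
zk.start()
set_or_create(zk, "/config/demo", b'{"ttl": 30}')
zk.stop()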
class ZKHandler(object): def __init__(self, config, logger=None): """ Initialize an instance of the ZKHandler class with config A zk_conn object will be created but not started A ZKSchema instance will be created """ self.encoding = "utf8" self.coordinators = config["coordinators"] self.logger = logger self.zk_conn = KazooClient(hosts=self.coordinators) self._schema = ZKSchema() # # Class meta-functions # def coordinators(self): return str(self.coordinators) def log(self, message, state=""): if self.logger is not None: self.logger.out(message, state) else: print(message) # # Properties # @property def schema(self): return self._schema # # State/connection management # def listener(self, state): """ Listen for KazooState changes and log accordingly. This function does not do anything except for log the state, and Kazoo handles the rest. """ if state == KazooState.CONNECTED: self.log("Connection to Zookeeper resumed", state="o") else: self.log( "Connection to Zookeeper lost with state {}".format(state), state="w") def connect(self, persistent=False): """ Start the zk_conn object and connect to the cluster """ try: self.zk_conn.start() if persistent: self.log("Connection to Zookeeper started", state="o") self.zk_conn.add_listener(self.listener) except Exception as e: raise ZKConnectionException(self, e) def disconnect(self, persistent=False): """ Stop and close the zk_conn object and disconnect from the cluster The class instance may be reused later (avoids persistent connections) """ self.zk_conn.stop() self.zk_conn.close() if persistent: self.log("Connection to Zookeeper terminated", state="o") # # Schema helper actions # def get_schema_path(self, key): """ Get the Zookeeper path for {key} from the current schema based on its format. If {key} is a tuple of length 2, it's treated as a path plus an item instance of that path (e.g. a node, a VM, etc.). If {key} is a tuple of length 4, it is treated as a path plus an item instance, as well as another item instance of the subpath. If {key} is just a string, it's treated as a lone path (mostly used for the 'base' schema group. Otherwise, returns None since this is not a valid key. This function also handles the special case where a string that looks like an existing path (i.e. starts with '/') is passed; in that case it will silently return the same path back. This was mostly a migration functionality and is deprecated. 
""" if isinstance(key, tuple): # This is a key tuple with both an ipath and an item if len(key) == 2: # 2-length normal tuple ipath, item = key elif len(key) == 4: # 4-length sub-level tuple ipath, item, sub_ipath, sub_item = key return self.schema.path(ipath, item=item) + self.schema.path( sub_ipath, item=sub_item) else: # This is an invalid key return None elif isinstance(key, str): # This is a key string with just an ipath ipath = key item = None # This is a raw key path, used by backup/restore functionality if re.match(r"^/", ipath): return ipath else: # This is an invalid key return None return self.schema.path(ipath, item=item) # # Key Actions # def exists(self, key): """ Check if a key exists """ path = self.get_schema_path(key) if path is None: # This path is invalid, this is likely due to missing schema entries, so return False return False stat = self.zk_conn.exists(path) if stat: return True else: return False def read(self, key): """ Read data from a key """ try: path = self.get_schema_path(key) if path is None: # This path is invalid; this is likely due to missing schema entries, so return None return None return self.zk_conn.get(path)[0].decode(self.encoding) except NoNodeError: return None def write(self, kvpairs): """ Create or update one or more keys' data """ if type(kvpairs) is not list: self.log("ZKHandler error: Key-value sequence is not a list", state="e") return False transaction = self.zk_conn.transaction() for kvpair in kvpairs: if type(kvpair) is not tuple: self.log( "ZKHandler error: Key-value pair '{}' is not a tuple". format(kvpair), state="e", ) return False key = kvpair[0] value = kvpair[1] path = self.get_schema_path(key) if path is None: # This path is invalid; this is likely due to missing schema entries, so continue continue if not self.exists(key): # Creating a new key transaction.create(path, str(value).encode(self.encoding)) else: # Updating an existing key data = self.zk_conn.get(path) version = data[1].version # Validate the expected version after the execution new_version = version + 1 # Update the data transaction.set_data(path, str(value).encode(self.encoding)) # Check the data try: transaction.check(path, new_version) except TypeError: self.log( "ZKHandler error: Key '{}' does not match expected version" .format(path), state="e", ) return False try: transaction.commit() return True except Exception as e: self.log( "ZKHandler error: Failed to commit transaction: {}".format(e), state="e") return False def delete(self, keys, recursive=True): """ Delete a key or list of keys (defaults to recursive) """ if type(keys) is not list: keys = [keys] for key in keys: if self.exists(key): try: path = self.get_schema_path(key) self.zk_conn.delete(path, recursive=recursive) except Exception as e: self.log( "ZKHandler error: Failed to delete key {}: {}".format( path, e), state="e", ) return False return True def children(self, key): """ Lists all children of a key """ try: path = self.get_schema_path(key) if path is None: # This path is invalid; this is likely due to missing schema entries, so return None return None return self.zk_conn.get_children(path) except NoNodeError: return None def rename(self, kkpairs): """ Rename one or more keys to a new value """ if type(kkpairs) is not list: self.log("ZKHandler error: Key-key sequence is not a list", state="e") return False transaction = self.zk_conn.transaction() def rename_element(transaction, source_path, destination_path): data = self.zk_conn.get(source_path)[0] transaction.create(destination_path, data) if 
self.children(source_path): for child_path in self.children(source_path): child_source_path = "{}/{}".format(source_path, child_path) child_destination_path = "{}/{}".format( destination_path, child_path) rename_element(transaction, child_source_path, child_destination_path) transaction.delete(source_path) for kkpair in kkpairs: if type(kkpair) is not tuple: self.log( "ZKHandler error: Key-key pair '{}' is not a tuple".format( kkpair), state="e", ) return False source_key = kkpair[0] source_path = self.get_schema_path(source_key) if source_path is None: # This path is invalid; this is likely due to missing schema entries, so continue continue destination_key = kkpair[1] destination_path = self.get_schema_path(destination_key) if destination_path is None: # This path is invalid; this is likely due to missing schema entries, so continue continue if not self.exists(source_key): self.log( "ZKHander error: Source key '{}' does not exist".format( source_path), state="e", ) return False if self.exists(destination_key): self.log( "ZKHander error: Destination key '{}' already exists". format(destination_path), state="e", ) return False rename_element(transaction, source_path, destination_path) try: transaction.commit() return True except Exception as e: self.log( "ZKHandler error: Failed to commit transaction: {}".format(e), state="e") return False # # Lock actions # def readlock(self, key): """ Acquires a read lock on a key """ count = 1 lock = None path = self.get_schema_path(key) while True: try: lock_id = str(uuid.uuid1()) lock = self.zk_conn.ReadLock(path, lock_id) break except NoNodeError: self.log( "ZKHandler warning: Failed to acquire read lock on nonexistent path {}" .format(path), state="e", ) return None except Exception as e: if count > 5: self.log( "ZKHandler warning: Failed to acquire read lock after 5 tries: {}" .format(e), state="e", ) break else: time.sleep(0.5) count += 1 continue return lock def writelock(self, key): """ Acquires a write lock on a key """ count = 1 lock = None path = self.get_schema_path(key) while True: try: lock_id = str(uuid.uuid1()) lock = self.zk_conn.WriteLock(path, lock_id) break except NoNodeError: self.log( "ZKHandler warning: Failed to acquire write lock on nonexistent path {}" .format(path), state="e", ) return None except Exception as e: if count > 5: self.log( "ZKHandler warning: Failed to acquire write lock after 5 tries: {}" .format(e), state="e", ) break else: time.sleep(0.5) count += 1 continue return lock def exclusivelock(self, key): """ Acquires an exclusive lock on a key """ count = 1 lock = None path = self.get_schema_path(key) while True: try: lock_id = str(uuid.uuid1()) lock = self.zk_conn.Lock(path, lock_id) break except NoNodeError: self.log( "ZKHandler warning: Failed to acquire exclusive lock on nonexistent path {}" .format(path), state="e", ) return None except Exception as e: if count > 5: self.log( "ZKHandler warning: Failed to acquire exclusive lock after 5 tries: {}" .format(e), state="e", ) break else: time.sleep(0.5) count += 1 continue return lock
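# write() above batches updates into a single kazoo transaction and guards each
# update with a version check (the expected version is current version + 1 after
# the set applies). A standalone sketch of that check-and-set pattern; the
# endpoint, path and value are illustrative assumptions.
from kazoo.client import KazooClient

zk = KazooClient(hosts="127.0.0.1:2181")
zk.start()
zk.ensure_path("/example/key")

data, stat = zk.get("/example/key")
tx = zk.transaction()
tx.set_data("/example/key", b"new value")
tx.check("/example/key", stat.version + 1)  # version expected after set_data applies
results = tx.commit()  # list of per-operation results; the whole batch rolls back on failure
zk.stop()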
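# A minimal end-to-end usage sketch for ZKHandler, assuming the class is
# importable as daemon_lib.zkhandler.ZKHandler; the module path, coordinator
# addresses and schema keys are illustrative assumptions.
from daemon_lib.zkhandler import ZKHandler  # assumed module path

config = {"coordinators": "10.0.0.1:2181,10.0.0.2:2181,10.0.0.3:2181"}
zkhandler = ZKHandler(config)
zkhandler.connect(persistent=True)

# write() takes a list of (key, value) tuples and commits them as one transaction;
# keys are (schema_path, item) tuples resolved through get_schema_path().
zkhandler.write([(("node.state.daemon", "hv1"), "run")])
state = zkhandler.read(("node.state.daemon", "hv1"))   # decoded str, or None if missing

# Lock helpers return plain kazoo lock recipes; acquire and release them explicitly.
lock = zkhandler.writelock(("node.state.daemon", "hv1"))
if lock is not None:
    lock.acquire()
    try:
        zkhandler.write([(("node.state.daemon", "hv1"), "shutdown")])
    finally:
        lock.release()

zkhandler.disconnect(persistent=True)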
class Applier:
    def __init__(self):
        self._zk = KazooClient(hosts=f'{os.getenv("ZOOKEEPER_HOST")}:2181')
        self._logger = logging.getLogger(__name__)
        self._logger.setLevel(logging.getLevelName(os.getenv("LOG_LEVEL", "INFO")))
        ch = logging.StreamHandler()
        ch.setLevel(logging.getLevelName(os.getenv("LOG_LEVEL", "INFO")))
        self._logger.addHandler(ch)

    def start(self):
        self._logger.debug("Applier started")
        self._zk.start()
        self._attempt_to_apply_next_target()
        scheduler = BlockingScheduler(timezone="UTC")
        scheduler.add_job(self._attempt_to_apply_next_target, 'interval', minutes=1)
        scheduler.start()

    def stop(self):
        self._zk.stop()

    def _attempt_to_apply_next_target(self):
        if self._is_next_target_ready():
            self._apply_next_target()

    def _apply_next_target(self):
        self._logger.info("Applying next target")
        self._zk.ensure_path(ZK_CURRENT_TARGET)
        next_target_id = self._zk.get(ZK_NEXT_TARGET)[0]
        tx = self._zk.transaction()
        tx.set_data(ZK_NEXT_TARGET, b'')
        tx.set_data(ZK_CURRENT_TARGET, next_target_id)
        tx.commit()

    def _is_next_target_ready(self):
        if self._zk.exists(ZK_NEXT_TARGET) is None:
            return False
        next_target_id = self._zk.get(ZK_NEXT_TARGET)[0].decode()
        if not next_target_id or self._zk.exists(f'/phrases/distributor/{next_target_id}') is None:
            return False
        partitions = self._zk.get_children(f'/phrases/distributor/{next_target_id}/partitions')
        if not partitions:
            return False
        for partition in partitions:
            nodes_path = f'/phrases/distributor/{next_target_id}/partitions/{partition}/nodes'
            nodes = self._zk.get_children(nodes_path)
            if len(nodes) < NUMBER_NODES_PER_PARTITION:
                return False
            for node in nodes:
                hostname = self._zk.get(f'{nodes_path}/{node}')[0].decode()
                if not hostname:
                    return False
        return True
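# A hedged sketch of the producer side that the Applier above polls for. The znode
# layout mirrors what _is_next_target_ready() verifies; the constant values, the
# ZooKeeper endpoint and the hostnames are illustrative assumptions (the snippet
# does not define ZK_NEXT_TARGET or NUMBER_NODES_PER_PARTITION).
import uuid
from kazoo.client import KazooClient

ZK_NEXT_TARGET = "/phrases/distributor/next_target"   # assumed constant value
NUMBER_NODES_PER_PARTITION = 2                         # assumed constant value

zk = KazooClient(hosts="127.0.0.1:2181")
zk.start()

target_id = str(uuid.uuid4())
hostnames = ["10.0.0.10", "10.0.0.11"]
for partition in ("0", "1"):
    for node, hostname in enumerate(hostnames[:NUMBER_NODES_PER_PARTITION]):
        path = f"/phrases/distributor/{target_id}/partitions/{partition}/nodes/{node}"
        zk.create(path, hostname.encode("utf-8"), makepath=True)

# Publishing the id is the hand-off: the Applier promotes it to ZK_CURRENT_TARGET
# (and clears ZK_NEXT_TARGET) on its next scheduled poll.
zk.ensure_path(ZK_NEXT_TARGET)
zk.set(ZK_NEXT_TARGET, target_id.encode("utf-8"))
zk.stop()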