def __init__(self, module, zk_srv_ip='127.0.0.1', zk_srv_port='2181',
             reset_config=False):
    self._reset_config = reset_config
    self._ds = None
    zk_endpts = []
    for ip in zk_srv_ip.split(','):
        zk_endpts.append('%s:%s' % (ip, zk_srv_port))
    ZookeeperClient.__init__(self, module, ','.join(zk_endpts))
    self._zk = self._zk_client
    if reset_config:
        self.delete_node("/services", recursive=True)
        self.delete_node("/clients", recursive=True)
        self.delete_node("/election", recursive=True)
    # create default paths
    self.create_node("/services")
    self.create_node("/clients")
    self.create_node("/election")
    self._debug = {
        'subscription_expires': 0,
        'oos_delete': 0,
        'db_excepts': 0,
    }
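The constructor above wipes and recreates the discovery paths on reset. For reference, a minimal sketch of the same bootstrap against bare ZooKeeper using kazoo (the client library the contrail ZookeeperClient wrapper is built on); the host string and reset semantics here are assumptions:

    from kazoo.client import KazooClient

    def bootstrap_discovery_paths(hosts='127.0.0.1:2181', reset_config=False):
        # Mirrors the wrapper above: optionally wipe, then recreate the
        # three well-known discovery paths.
        zk = KazooClient(hosts=hosts)
        zk.start()
        for path in ("/services", "/clients", "/election"):
            if reset_config and zk.exists(path):
                zk.delete(path, recursive=True)  # like delete_node(..., True)
            zk.ensure_path(path)                 # idempotent create
        return zk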
def main(args_str=None):
    global _zookeeper_client

    if not args_str:
        args_str = ' '.join(sys.argv[1:])
    args = parse_args(args_str)
    if args.cluster_id:
        client_pfx = args.cluster_id + '-'
        zk_path_pfx = args.cluster_id + '/'
    else:
        client_pfx = ''
        zk_path_pfx = ''

    # randomize collector list
    args.random_collectors = args.collectors
    if args.collectors:
        args.random_collectors = random.sample(args.collectors,
                                               len(args.collectors))

    # Initialize logger without introspect thread
    dm_logger = DeviceManagerLogger(args, http_server_port=-1)

    # Initialize AMQP handler then close it to be sure any queue remaining
    # from a previous run is cleaned up
    vnc_amqp = DMAmqpHandle(dm_logger, DeviceManager.REACTION_MAP, args)
    vnc_amqp.establish()
    vnc_amqp.close()
    dm_logger.debug("Removed remained AMQP queue")

    _zookeeper_client = ZookeeperClient(client_pfx + "device-manager",
                                        args.zk_server_ip)
    dm_logger.notice("Waiting to be elected as master...")
    _zookeeper_client.master_election(zk_path_pfx + "/device-manager",
                                      os.getpid(), run_device_manager,
                                      dm_logger, args)
def main(args_str=None):
    global _zookeeper_client

    if not args_str:
        args_str = " ".join(sys.argv[1:])
    args = parse_args(args_str)

    _zookeeper_client = ZookeeperClient("svc-monitor", args.zk_server_ip)
    _zookeeper_client.master_election("/svc-monitor", os.getpid(),
                                      run_svc_monitor, args)
def main(args_str=None):
    if not args_str:
        args_str = ' '.join(sys.argv[1:])
    args = parse_args(args_str)

    _disc_service = ZookeeperClient("svc-monitor", args.zk_server_ip)
    _disc_service.master_election("/svc-monitor", os.getpid(),
                                  run_svc_monitor, args)
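Every entrypoint in this section follows the same pattern: build the client, then block in master_election until this process wins, at which point the run callback is invoked with the trailing arguments. The same flow can be sketched with kazoo's Election recipe; whether the contrail wrapper uses this exact recipe internally is an assumption, but the blocking semantics match:

    import os
    from kazoo.client import KazooClient

    def run_svc_monitor():
        # runs only while this process holds mastership
        pass

    zk = KazooClient(hosts='127.0.0.1:2181')
    zk.start()
    election = zk.Election("/svc-monitor", identifier=str(os.getpid()))
    election.run(run_svc_monitor)  # blocks until elected, then calls the function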
def main(args_str=None):
    global _zookeeper_client

    if not args_str:
        args_str = " ".join(sys.argv[1:])
    args = parse_args(args_str)
    if args.cluster_id:
        client_pfx = args.cluster_id + "-"
        zk_path_pfx = args.cluster_id + "/"
    else:
        client_pfx = ""
        zk_path_pfx = ""

    _zookeeper_client = ZookeeperClient(client_pfx + "device-manager",
                                        args.zk_server_ip)
    _zookeeper_client.master_election(zk_path_pfx + "/device-manager",
                                      os.getpid(), run_device_manager, args)
def main(args_str=None, kube_api_skip=False, event_queue=None,
         vnc_kubernetes_config_dict=None):
    _zookeeper_client = None

    args = kube_args.parse_args(args_str)
    if 'kube_timer_interval' not in args:
        args.kube_timer_interval = '60'

    if args.cluster_id:
        client_pfx = args.cluster_id + '-'
        zk_path_pfx = args.cluster_id + '/'
    else:
        client_pfx = ''
        zk_path_pfx = ''

    # randomize collector list
    args.random_collectors = args.collectors
    if args.collectors:
        args.random_collectors = random.sample(args.collectors,
                                               len(args.collectors))

    km_logger = logger.KubeManagerLogger(args, http_server_port=-1)

    if args.nested_mode == '0':
        # Initialize AMQP handler then close it to be sure any queue
        # remaining from a previous run is cleaned up
        rabbitmq_cfg = kube_args.rabbitmq_args(args)
        try:
            vnc_amqp = VncAmqpHandle(km_logger._sandesh, km_logger, DBBaseKM,
                                     REACTION_MAP, 'kube_manager',
                                     rabbitmq_cfg)
            vnc_amqp.establish()
            vnc_amqp.close()
        except Exception:
            pass
        finally:
            km_logger.debug("Removed remained AMQP queue")

        # Ensure zookeeper is up and running before starting kube-manager
        _zookeeper_client = ZookeeperClient(client_pfx + "kube-manager",
                                            args.zk_server_ip)

        km_logger.notice("Waiting to be elected as master...")
        _zookeeper_client.master_election(zk_path_pfx + "/kube-manager",
                                          os.getpid(), run_kube_manager,
                                          km_logger, args, kube_api_skip,
                                          event_queue,
                                          vnc_kubernetes_config_dict)
    else:
        # nested mode, skip zookeeper mastership check
        run_kube_manager(km_logger, args, kube_api_skip, event_queue,
                         vnc_kubernetes_config_dict)
def issu_zk_start(self):
    # Connect to old and new ZK servers
    self._zk_old = ZookeeperClient("zk issu client older version",
                                   self._Old_ZK_Version_Address,
                                   self._New_ZK_Version_Address)
    self._zk_old.set_lost_cb(self.issu_restart)
    self._zk_old.set_suspend_cb(self.issu_restart)
    self._zk_new = ZookeeperClient("zk issu client newer version",
                                   self._New_ZK_Version_Address,
                                   self._New_ZK_Version_Address)
    self._zk_new.set_lost_cb(self.issu_restart)
    self._zk_new.set_suspend_cb(self.issu_restart)

    old_prefix = self._Old_Prefix + "/"
    new_prefix = self._New_Prefix + "/"

    # Delete all state in new ZK if any
    if self._zk_new.exists(new_prefix):
        children = self._zk_new.get_children(new_prefix)
        for _path in children:
            if _path in self._Znode_Issu_List:
                self._logger(
                    "Issu contrail zookeeper, issu_zk_start, deleted "
                    "path " + str(new_prefix + str(_path)),
                    level=SandeshLevel.SYS_INFO,
                )
                self._zk_new.delete_node(new_prefix + str(_path), True)
    else:
        self._zk_new.create_node(new_prefix, "")

    # Guard against neither prefix existing, which would otherwise leave
    # 'children' unbound below
    children = []
    if self._zk_old.exists(old_prefix):
        children = self._zk_old.get_children(old_prefix)
    elif self._zk_old.exists(self._Old_Prefix):
        children = self._zk_old.get_children(self._Old_Prefix)
        old_prefix = self._Old_Prefix
    for _path in children:
        # Ignore zookeeper replication nodes; copy only znodes on the
        # ISSU list
        if _path in self._Znode_Issu_List:
            new_path = new_prefix + str(_path)
            old_path = old_prefix + str(_path)
            time.sleep(1)
            self._zk_copy(old_path, new_path)

    self.issu_compare(new_prefix, old_prefix)
def main(args_str=None):
    global _zookeeper_client

    if not args_str:
        args_str = ' '.join(sys.argv[1:])
    args = parse_args(args_str)
    if args.cluster_id:
        client_pfx = args.cluster_id + '-'
        zk_path_pfx = args.cluster_id + '/'
    else:
        client_pfx = ''
        zk_path_pfx = ''

    _zookeeper_client = ZookeeperClient(client_pfx + "schema",
                                        args.zk_server_ip)
    _zookeeper_client.master_election(zk_path_pfx + "/schema-transformer",
                                      os.getpid(), run_schema_transformer,
                                      args)
def main():
    # remove the config_db_uuid and useragent keyspaces
    config = ConfigParser.SafeConfigParser({'admin_token': None})
    config.read('/etc/contrail/contrail-api.conf')
    server_list_str = config.get('DEFAULTS', 'cassandra_server_list')
    server_list = server_list_str.split()

    server_idx = 0
    num_dbnodes = len(server_list)
    connected = False
    cass_server = None
    sys_mgr = None
    while not connected:
        try:
            cass_server = server_list[server_idx]
            sys_mgr = SystemManager(cass_server)
            connected = True
        except Exception:
            server_idx = (server_idx + 1) % num_dbnodes
            time.sleep(3)

    uuid_keyspace_name = 'config_db_uuid'
    agent_keyspace_name = 'useragent'
    try:
        print("deleting config_db_uuid keyspace")
        sys_mgr.drop_keyspace(uuid_keyspace_name)
    except pycassa.cassandra.ttypes.InvalidRequestException as e:
        print("Warning! " + str(e))
    try:
        print("deleting useragent keyspace")
        sys_mgr.drop_keyspace(agent_keyspace_name)
    except pycassa.cassandra.ttypes.InvalidRequestException as e:
        print("Warning! " + str(e))

    # delete znodes
    _SUBNET_PATH = "/api-server/subnets"
    _FQ_NAME_TO_UUID_PATH = "/fq-name-to-uuid"
    _zk_client = None
    while True:
        try:
            _zk_client = ZookeeperClient("api-" + '0', '127.0.0.1:2181')
            break
        except gevent.event.Timeout:
            pass
    print("deleting nodes at", _SUBNET_PATH)
    _zk_client.delete_node(_SUBNET_PATH, True)
    print("deleting nodes at", _FQ_NAME_TO_UUID_PATH)
    _zk_client.delete_node(_FQ_NAME_TO_UUID_PATH, True)
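The connection loop above retries forever at a fixed 3-second interval and swallows every exception. A bounded variant with capped exponential backoff is sketched below; the attempt limit and delay cap are arbitrary assumptions:

    import time

    def connect_system_manager(server_list, max_attempts=30):
        # Round-robin over the configured Cassandra nodes, backing off
        # exponentially up to a 30s cap instead of sleeping a flat 3s.
        delay = 1
        for attempt in range(max_attempts):
            server = server_list[attempt % len(server_list)]
            try:
                return SystemManager(server)
            except Exception as e:
                print("connect to %s failed (%s), retry in %ss" %
                      (server, e, delay))
                time.sleep(delay)
                delay = min(delay * 2, 30)
        raise RuntimeError("could not connect to any Cassandra node")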
def main(args_str=None):
    global _zookeeper_client

    if not args_str:
        args_str = ' '.join(sys.argv[1:])
    args = parse_args(args_str)
    if args.cluster_id:
        client_pfx = args.cluster_id + '-'
        zk_path_pfx = args.cluster_id + '/'
    else:
        client_pfx = ''
        zk_path_pfx = ''

    # randomize collector list
    args.random_collectors = args.collectors
    if args.collectors:
        args.random_collectors = random.sample(args.collectors,
                                               len(args.collectors))

    # Initialize logger without introspect thread
    sm_logger = ServiceMonitorLogger(args, http_server_port=-1)

    # Initialize AMQP handler then close it to be sure any queue remaining
    # from a previous run is cleaned up
    rabbitmq_cfg = get_rabbitmq_cfg(args)
    try:
        vnc_amqp = VncAmqpHandle(sm_logger._sandesh, sm_logger, DBBaseSM,
                                 REACTION_MAP, 'svc_monitor', rabbitmq_cfg,
                                 args.trace_file)
        vnc_amqp.establish()
        vnc_amqp.close()
    except Exception:
        pass
    finally:
        sm_logger.debug("Removed remained AMQP queue")

    # Wait to be elected as the master node
    _zookeeper_client = ZookeeperClient(client_pfx + "svc-monitor",
                                        args.zk_server_ip)
    sm_logger.notice("Waiting to be elected as master...")
    _zookeeper_client.master_election(zk_path_pfx + "/svc-monitor",
                                      os.getpid(), run_svc_monitor,
                                      sm_logger, args)
def main(args_str=None):
    global _zookeeper_client

    if not args_str:
        args_str = ' '.join(sys.argv[1:])
    args = parse_args(args_str)
    args._args_list = args_str
    if args.cluster_id:
        client_pfx = args.cluster_id + '-'
        zk_path_pfx = args.cluster_id + '/'
    else:
        client_pfx = ''
        zk_path_pfx = ''

    # randomize collector list
    args.random_collectors = args.collectors
    if args.collectors:
        args.random_collectors = random.sample(args.collectors,
                                               len(args.collectors))

    # Initialize logger without introspect thread
    st_logger = SchemaTransformerLogger(args, http_server_port=-1)

    # Initialize AMQP handler then close it to be sure any queue remaining
    # from a previous run is cleaned up
    vnc_amqp = STAmqpHandle(st_logger, SchemaTransformer.REACTION_MAP, args)
    vnc_amqp.establish()
    vnc_amqp.close()
    st_logger.debug("Removed remained AMQP queue")

    # Wait to be elected as the master node
    if 'host_ip' in args:
        host_ip = args.host_ip
    else:
        host_ip = socket.gethostbyname(socket.getfqdn())
    _zookeeper_client = ZookeeperClient(client_pfx + "schema",
                                        args.zk_server_ip, host_ip,
                                        zk_timeout=args.zk_timeout)
    st_logger.notice("Waiting to be elected as master...")
    _zookeeper_client.master_election(zk_path_pfx + "/schema-transformer",
                                      os.getpid(), run_schema_transformer,
                                      st_logger, args)
class ContrailZKIssu():

    def __init__(self, Old_Version_Address, New_Version_Address, Old_Prefix,
                 New_Prefix, Znode_Issu_List, logger):
        self._Old_ZK_Version_Address = Old_Version_Address
        self._New_ZK_Version_Address = New_Version_Address
        self._Old_Prefix = '/' + Old_Prefix
        self._New_Prefix = '/' + New_Prefix
        self._Znode_Issu_List = list(Znode_Issu_List)
        self._logger = logger
        self._logger(
            "Issu contrail zookeeper initialized...",
            level=SandeshLevel.SYS_INFO,
        )
    # end __init__

    # Create new path recursively
    def _zk_copy(self, old_v_path, new_v_path):
        children = self._zk_old.get_children(old_v_path)
        value = self._zk_old.read_node(old_v_path)
        self._logger(
            "Issu contrail zookeeper, _zk_copy, old version path " +
            str(old_v_path),
            level=SandeshLevel.SYS_DEBUG,
        )
        self._logger(
            "Issu contrail zookeeper, _zk_copy, new version path " +
            str(new_v_path),
            level=SandeshLevel.SYS_DEBUG,
        )
        self._zk_new.create_node(new_v_path, value)
        value = self._zk_new.read_node(new_v_path)
        self._logger(
            "Issu contrail zookeeper, _zk_copy, new value " + str(value),
            level=SandeshLevel.SYS_DEBUG,
        )
        for _path in children:
            new_path = str(new_v_path) + '/' + str(_path)
            old_path = str(old_v_path) + '/' + str(_path)
            self._zk_copy(old_path, new_path)
    # end _zk_copy

    def issu_compare(self, new_prefix, old_prefix):
        for _path in self._Znode_Issu_List:
            new_path = new_prefix + str(_path)
            old_path = old_prefix + str(_path)
            _new_children = sorted(self._zk_new.get_children(new_path))
            _old_children = sorted(self._zk_old.get_children(old_path))
            # cmp() no longer exists in Python 3; compare the sorted child
            # lists directly, and report failure only once, on the first
            # mismatch
            if _new_children != _old_children:
                self._logger(
                    "Issu contrail zookeeper failed...",
                    level=SandeshLevel.SYS_DEBUG,
                )
                return
        self._logger(
            "Issu contrail zookeeper passed...",
            level=SandeshLevel.SYS_INFO,
        )
    # end issu_compare

    def issu_zk_start(self):
        # Connect to old and new ZK servers
        self._zk_old = ZookeeperClient("zk issu client older version",
                                       self._Old_ZK_Version_Address)
        self._zk_old.set_lost_cb(self.issu_restart)
        self._zk_old.set_suspend_cb(self.issu_restart)
        self._zk_new = ZookeeperClient("zk issu client newer version",
                                       self._New_ZK_Version_Address)
        self._zk_new.set_lost_cb(self.issu_restart)
        self._zk_new.set_suspend_cb(self.issu_restart)

        old_prefix = self._Old_Prefix + "/"
        new_prefix = self._New_Prefix + "/"

        # Delete all state in new ZK if any
        if self._zk_new.exists(new_prefix):
            children = self._zk_new.get_children(new_prefix)
            for _path in children:
                if _path == "zookeeper":
                    continue
                self._logger(
                    "Issu contrail zookeeper, issu_zk_start, deleted "
                    "path " + str(new_prefix + str(_path)),
                    level=SandeshLevel.SYS_INFO,
                )
                self._zk_new.delete_node(new_prefix + str(_path), True)
        else:
            self._zk_new.create_node(new_prefix, "")

        # Guard against neither prefix existing, which would otherwise
        # leave 'children' unbound below
        children = []
        if self._zk_old.exists(old_prefix):
            children = self._zk_old.get_children(old_prefix)
        elif self._zk_old.exists(self._Old_Prefix):
            children = self._zk_old.get_children(self._Old_Prefix)
            old_prefix = self._Old_Prefix
        for _path in children:
            # Ignore zookeeper replication nodes; copy only znodes on the
            # ISSU list
            if _path in self._Znode_Issu_List:
                new_path = new_prefix + str(_path)
                old_path = old_prefix + str(_path)
                time.sleep(1)
                self._zk_copy(old_path, new_path)

        self.issu_compare(new_prefix, old_prefix)
    # end issu_zk_start

    def issu_restart(self):
        # Called when the connection to ZK is lost in the middle of ISSU
        self._logger(
            "Issu contrail zookeeper restarted...",
            level=SandeshLevel.SYS_INFO,
        )
        # drop the zookeeper connections
        self._zk_old._zk_client.stop()
        self._zk_new._zk_client.stop()
        # Call the ISSU start again.
        self.issu_zk_start()
    # end issu_restart
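A hypothetical driver for the class above, showing the expected argument shapes: ensemble addresses as host:port strings, prefixes without the leading slash (the constructor adds it), the list of znodes to migrate, and a Sandesh-style logger callable. All concrete values here are placeholders:

    znodes = ['fq-name-to-uuid', 'api-server', 'id']
    issu = ContrailZKIssu('10.0.0.1:2181',   # old-version ensemble (placeholder)
                          '10.0.0.2:2181',   # new-version ensemble (placeholder)
                          'old-cluster',     # stored as /old-cluster
                          'new-cluster',     # stored as /new-cluster
                          znodes,
                          sandesh_logger)    # stand-in for the real logger
    issu.issu_zk_start()  # wipe new side, copy listed znodes, then compare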
def __init__(self, amqp_client, db_conn, args, dm_logger):
    """Initialize ZooKeeper, RabbitMQ, Sandesh, DB conn etc."""
    DeviceJobManager._instance = self
    self._amqp_client = amqp_client
    # create zk client for devicejobmanager with callback
    self.client_reconnect_gl = None
    if args.zookeeper_ssl_enable:
        self._zookeeper_client = ZookeeperClient(
            "device-job-manager", args.zk_server_ip, args.host_ip,
            args.zookeeper_ssl_enable, args.zookeeper_ssl_keyfile,
            args.zookeeper_ssl_certificate, args.zookeeper_ssl_ca_cert)
    else:
        self._zookeeper_client = ZookeeperClient("device-job-manager",
                                                 args.zk_server_ip,
                                                 args.host_ip)
    self._zookeeper_client.set_lost_cb(self.client_reconnect)

    self._db_conn = db_conn
    self._args = args
    self._job_mgr_statistics = {
        'max_job_count': self._args.max_job_count,
        'running_job_count': 0
    }
    # dict of exec_id:job_status (key/value pairs)
    self.job_status = {}
    # map of running job instances. Key is the pid and value is job
    # instance info
    self._job_mgr_running_instances = {}

    job_args = {
        'collectors': self._args.collectors,
        'fabric_ansible_conf_file': self._args.fabric_ansible_conf_file,
        'host_ip': self._args.host_ip,
        'zk_server_ip': self._args.zk_server_ip,
        'cluster_id': self._args.cluster_id,
        'zookeeper_ssl_enable': self._args.zookeeper_ssl_enable,
        'zookeeper_ssl_keyfile': self._args.zookeeper_ssl_keyfile,
        'zookeeper_ssl_certificate': self._args.zookeeper_ssl_certificate,
        'zookeeper_ssl_ca_cert': self._args.zookeeper_ssl_ca_cert
    }
    self._job_args = json.dumps(job_args)

    # initialize the job logger
    self._job_log_utils = JobLogUtils(
        sandesh_instance_id="DeviceJobManager" + str(time.time()),
        config_args=self._job_args,
        sandesh_instance=dm_logger._sandesh)
    self._logger = self._job_log_utils.config_logger
    self._sandesh = self._logger._sandesh

    self._amqp_client.add_exchange(self.JOB_STATUS_EXCHANGE, type='direct')
    # add dummy consumer to initialize the exchange
    self._amqp_client.add_consumer(
        self.JOB_STATUS_CONSUMER + "dummy", self.JOB_STATUS_EXCHANGE,
        routing_key=self.JOB_STATUS_ROUTING_KEY + "dummy",
        auto_delete=True)

    self._amqp_client.add_exchange(self.JOB_REQUEST_EXCHANGE, type='direct')
    self._amqp_client.add_consumer(
        self.JOB_REQUEST_CONSUMER, self.JOB_REQUEST_EXCHANGE,
        routing_key=self.JOB_REQUEST_ROUTING_KEY,
        callback=self.handle_execute_job_request)

    abort_q_name = '.'.join([self.JOB_ABORT_CONSUMER,
                             socket.getfqdn(self._args.host_ip)])
    self._amqp_client.add_consumer(abort_q_name, self.JOB_REQUEST_EXCHANGE,
                                   routing_key=self.JOB_ABORT_ROUTING_KEY,
                                   callback=self.handle_abort_job_request)
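The SSL and non-SSL branches above duplicate the constructor call; the argument list can be built once instead. A sketch, assuming the SSL parameters are accepted positionally exactly as in the original call:

    # Build the ZookeeperClient arguments once; the positional order of the
    # SSL parameters is taken from the branch above.
    zk_args = ["device-job-manager", args.zk_server_ip, args.host_ip]
    if args.zookeeper_ssl_enable:
        zk_args += [args.zookeeper_ssl_enable, args.zookeeper_ssl_keyfile,
                    args.zookeeper_ssl_certificate,
                    args.zookeeper_ssl_ca_cert]
    self._zookeeper_client = ZookeeperClient(*zk_args)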
def main(args_str=None):
    global _amqp_client
    global _zookeeper_client

    if not args_str:
        args_str = ' '.join(sys.argv[1:])
    args = parse_args(args_str)
    if args.cluster_id:
        client_pfx = args.cluster_id + '-'
        zk_path_pfx = args.cluster_id + '/'
    else:
        client_pfx = ''
        zk_path_pfx = ''

    # randomize collector list
    args.random_collectors = args.collectors
    if args.collectors:
        args.random_collectors = random.sample(args.collectors,
                                               len(args.collectors))
    args.log_level = str(args.log_level)

    # Initialize logger without introspect thread
    dm_logger = DeviceManagerLogger(args, http_server_port=-1)

    # Initialize AMQP handler then close it to be sure any queue remaining
    # from a previous run is cleaned up
    vnc_amqp = DMAmqpHandle(dm_logger, {}, args)
    vnc_amqp.establish()
    vnc_amqp.close()
    dm_logger.debug("Removed remaining AMQP queue from previous run")

    if 'host_ip' not in args:
        args.host_ip = socket.gethostbyname(socket.getfqdn())

    _amqp_client = initialize_amqp_client(dm_logger, args)
    _zookeeper_client = ZookeeperClient(client_pfx + "device-manager",
                                        args.zk_server_ip, args.host_ip)
    _db_conn = initialize_db_connection(dm_logger, args)

    try:
        # Initialize the device job manager
        DeviceJobManager(_amqp_client, _zookeeper_client, _db_conn, args,
                         dm_logger)
        # Allow kombu client to connect consumers
        gevent.sleep(0.5)
    except Exception as e:
        dm_logger.error("Error while initializing the device job "
                        "manager %s" % str(e))
        raise e

    try:
        # Initialize the device ztp manager
        DeviceZtpManager(_amqp_client, _db_conn, args, dm_logger)
        # Allow kombu client to connect consumers
        gevent.sleep(0.5)
    except Exception as e:
        dm_logger.error("Error while initializing the device ztp "
                        "manager %s" % str(e))
        raise e

    gevent.signal(signal.SIGHUP, sighup_handler)
    gevent.signal(signal.SIGTERM, sigterm_handler)
    gevent.signal(signal.SIGINT, sigterm_handler)

    dm_logger.notice("Waiting to be elected as master...")
    _zookeeper_client.master_election(zk_path_pfx + "/device-manager",
                                      os.getpid(), run_device_manager,
                                      dm_logger, args)
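A note on the signal registrations above: in newer gevent releases the callable form gevent.signal(...) was replaced by gevent.signal_handler(...). A small compatibility shim, assuming only that one of the two names exists:

    # Prefer the modern name, fall back to the legacy callable.
    try:
        register_signal = gevent.signal_handler   # gevent >= 1.5
    except AttributeError:
        register_signal = gevent.signal           # older gevent
    register_signal(signal.SIGHUP, sighup_handler)
    register_signal(signal.SIGTERM, sigterm_handler)
    register_signal(signal.SIGINT, sigterm_handler)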
class ContrailZKIssu():

    def __init__(self, Old_Version_Address, New_Version_Address, Old_Prefix,
                 New_Prefix, Znode_Issu_List, logger):
        self._Old_ZK_Version_Address = Old_Version_Address
        self._New_ZK_Version_Address = New_Version_Address
        self._Old_Prefix = '/' + Old_Prefix
        self._New_Prefix = '/' + New_Prefix
        self._Znode_Issu_List = list(Znode_Issu_List)
        self._logger = logger
        self._logger(
            "Issu contrail zookeeper initialized...",
            level=SandeshLevel.SYS_INFO,
        )
    # end __init__

    # Create new path recursively
    def _zk_copy(self, old_v_path, new_v_path):
        children = self._zk_old.get_children(old_v_path)
        value = self._zk_old.read_node(old_v_path)
        self._logger(
            "Issu contrail zookeeper, _zk_copy, old version path " +
            str(old_v_path),
            level=SandeshLevel.SYS_DEBUG,
        )
        self._logger(
            "Issu contrail zookeeper, _zk_copy, new version path " +
            str(new_v_path),
            level=SandeshLevel.SYS_DEBUG,
        )
        self._zk_new.create_node(new_v_path, value)
        value = self._zk_new.read_node(new_v_path)
        self._logger(
            "Issu contrail zookeeper, _zk_copy, new value " + str(value),
            level=SandeshLevel.SYS_DEBUG,
        )
        for _path in children:
            new_path = str(new_v_path) + '/' + str(_path)
            old_path = str(old_v_path) + '/' + str(_path)
            self._zk_copy(old_path, new_path)
    # end _zk_copy

    def issu_compare(self, new_prefix, old_prefix):
        for _path in self._Znode_Issu_List:
            new_path = new_prefix + str(_path)
            old_path = old_prefix + str(_path)
            _new_children = sorted(self._zk_new.get_children(new_path))
            _old_children = sorted(self._zk_old.get_children(old_path))
            # cmp() no longer exists in Python 3; compare the sorted child
            # lists directly and stop at the first mismatch
            if _new_children != _old_children:
                self._logger(
                    "Issu contrail zookeeper failed...",
                    level=SandeshLevel.SYS_DEBUG,
                )
                return
        self._logger(
            "Issu contrail zookeeper passed...",
            level=SandeshLevel.SYS_INFO,
        )
    # end issu_compare

    def issu_zk_start(self):
        # Connect to old and new ZK servers
        self._zk_old = ZookeeperClient("zk issu client older version",
                                       self._Old_ZK_Version_Address,
                                       self._New_ZK_Version_Address)
        self._zk_old.set_lost_cb(self.issu_restart)
        self._zk_old.set_suspend_cb(self.issu_restart)
        self._zk_new = ZookeeperClient("zk issu client newer version",
                                       self._New_ZK_Version_Address,
                                       self._New_ZK_Version_Address)
        self._zk_new.set_lost_cb(self.issu_restart)
        self._zk_new.set_suspend_cb(self.issu_restart)

        old_prefix = self._Old_Prefix + "/"
        new_prefix = self._New_Prefix + "/"

        # Delete all state in new ZK if any
        if self._zk_new.exists(new_prefix):
            children = self._zk_new.get_children(new_prefix)
            for _path in children:
                if _path in self._Znode_Issu_List:
                    self._logger(
                        "Issu contrail zookeeper, issu_zk_start, deleted "
                        "path " + str(new_prefix + str(_path)),
                        level=SandeshLevel.SYS_INFO,
                    )
                    self._zk_new.delete_node(new_prefix + str(_path), True)
        else:
            self._zk_new.create_node(new_prefix, "")

        # Guard against neither prefix existing, which would otherwise
        # leave 'children' unbound below
        children = []
        if self._zk_old.exists(old_prefix):
            children = self._zk_old.get_children(old_prefix)
        elif self._zk_old.exists(self._Old_Prefix):
            children = self._zk_old.get_children(self._Old_Prefix)
            old_prefix = self._Old_Prefix
        for _path in children:
            # Ignore zookeeper replication nodes; copy only znodes on the
            # ISSU list
            if _path in self._Znode_Issu_List:
                new_path = new_prefix + str(_path)
                old_path = old_prefix + str(_path)
                time.sleep(1)
                self._zk_copy(old_path, new_path)

        self.issu_compare(new_prefix, old_prefix)
    # end issu_zk_start

    def issu_restart(self):
        # Called when the connection to ZK is lost in the middle of ISSU
        self._logger(
            "Issu contrail zookeeper restarted...",
            level=SandeshLevel.SYS_INFO,
        )
        # drop the zookeeper connections
        self._zk_old._zk_client.stop()
        self._zk_new._zk_client.stop()
        # Call the ISSU start again.
        self.issu_zk_start()
    # end issu_restart
def main(args_str=None):
    global _amqp_client
    global _zookeeper_client
    global _object_db

    if not args_str:
        args_str = ' '.join(sys.argv[1:])
    args = parse_args(args_str)
    if args.cluster_id:
        client_pfx = args.cluster_id + '-'
        zk_path_pfx = args.cluster_id + '/'
    else:
        client_pfx = ''
        zk_path_pfx = ''

    # randomize collector list
    args.random_collectors = args.collectors
    if args.collectors:
        args.random_collectors = random.sample(args.collectors,
                                               len(args.collectors))

    # Initialize logger without introspect thread
    dm_logger = DeviceManagerLogger(args, http_server_port=-1)

    # Initialize AMQP handler then close it to be sure any queue remaining
    # from a previous run is cleaned up
    vnc_amqp = DMAmqpHandle(dm_logger, {}, args)
    vnc_amqp.establish()
    vnc_amqp.close()
    dm_logger.debug("Removed remaining AMQP queue from previous run")

    if 'host_ip' in args:
        host_ip = args.host_ip
    else:
        host_ip = socket.gethostbyname(socket.getfqdn())

    _amqp_client = initialize_amqp_client(dm_logger, args)
    _zookeeper_client = ZookeeperClient(client_pfx + "device-manager",
                                        args.zk_server_ip, host_ip)
    _object_db = DMCassandraDB.get_instance(_zookeeper_client, args,
                                            dm_logger)

    try:
        # Initialize the device job manager
        DeviceJobManager(_object_db, _amqp_client, _zookeeper_client, args,
                         dm_logger)
    except Exception as e:
        dm_logger.error("Error while initializing the device job "
                        "manager %s" % repr(e))

    try:
        # Initialize the device ztp manager
        DeviceZtpManager(_amqp_client, args, dm_logger)
    except Exception as e:
        dm_logger.error("Error while initializing the device ztp "
                        "manager %s" % repr(e))

    gevent.signal(signal.SIGHUP, sighup_handler)
    gevent.signal(signal.SIGTERM, sigterm_handler)
    gevent.signal(signal.SIGINT, sigterm_handler)

    dm_logger.notice("Waiting to be elected as master...")
    _zookeeper_client.master_election(zk_path_pfx + "/device-manager",
                                      os.getpid(), run_device_manager,
                                      dm_logger, args)
class DeviceJobManager(object):

    JOB_REQUEST_EXCHANGE = "job_request_exchange"
    JOB_REQUEST_CONSUMER = "job_request_consumer"
    JOB_REQUEST_ROUTING_KEY = "job.request"
    JOB_ABORT_CONSUMER = "job_abort_consumer"
    JOB_ABORT_ROUTING_KEY = "job.abort"
    JOB_STATUS_EXCHANGE = "job_status_exchange"
    JOB_STATUS_CONSUMER = "job_status_consumer."
    JOB_STATUS_ROUTING_KEY = "job.status."
    JOB_STATUS_TTL = 5 * 60
    FABRIC_ZK_LOCK = "fabric-job-monitor"

    _instance = None

    def __init__(self, amqp_client, db_conn, args, dm_logger):
        """Initialize ZooKeeper, RabbitMQ, Sandesh, DB conn etc."""
        DeviceJobManager._instance = self
        self._amqp_client = amqp_client
        # create zk client for devicejobmanager with callback
        self.client_reconnect_gl = None
        self._zookeeper_client = ZookeeperClient("device-job-manager",
                                                 args.zk_server_ip,
                                                 args.host_ip)
        self._zookeeper_client.set_lost_cb(self.client_reconnect)

        self._db_conn = db_conn
        self._args = args
        self._job_mgr_statistics = {
            'max_job_count': self._args.max_job_count,
            'running_job_count': 0
        }
        # dict of exec_id:job_status (key/value pairs)
        self.job_status = {}
        # map of running job instances. Key is the pid and value is job
        # instance info
        self._job_mgr_running_instances = {}

        job_args = {
            'collectors': self._args.collectors,
            'fabric_ansible_conf_file': self._args.fabric_ansible_conf_file,
            'host_ip': self._args.host_ip,
            'zk_server_ip': self._args.zk_server_ip,
            'cluster_id': self._args.cluster_id
        }
        self._job_args = json.dumps(job_args)

        # initialize the job logger
        self._job_log_utils = JobLogUtils(
            sandesh_instance_id="DeviceJobManager" + str(time.time()),
            config_args=self._job_args,
            sandesh_instance=dm_logger._sandesh)
        self._logger = self._job_log_utils.config_logger
        self._sandesh = self._logger._sandesh

        self._amqp_client.add_exchange(self.JOB_STATUS_EXCHANGE,
                                       type='direct')
        # add dummy consumer to initialize the exchange
        self._amqp_client.add_consumer(
            self.JOB_STATUS_CONSUMER + "dummy", self.JOB_STATUS_EXCHANGE,
            routing_key=self.JOB_STATUS_ROUTING_KEY + "dummy",
            auto_delete=True)

        self._amqp_client.add_exchange(self.JOB_REQUEST_EXCHANGE,
                                       type='direct')
        self._amqp_client.add_consumer(
            self.JOB_REQUEST_CONSUMER, self.JOB_REQUEST_EXCHANGE,
            routing_key=self.JOB_REQUEST_ROUTING_KEY,
            callback=self.handle_execute_job_request)

        abort_q_name = '.'.join([self.JOB_ABORT_CONSUMER,
                                 socket.getfqdn(self._args.host_ip)])
        self._amqp_client.add_consumer(
            abort_q_name, self.JOB_REQUEST_EXCHANGE,
            routing_key=self.JOB_ABORT_ROUTING_KEY,
            callback=self.handle_abort_job_request)
    # end __init__

    @classmethod
    def get_instance(cls):
        return cls._instance
    # end get_instance

    @classmethod
    def destroy_instance(cls):
        inst = cls.get_instance()
        if not inst:
            return
        cls._instance = None
    # end destroy_instance

    def client_reconnect(self):
        if self.client_reconnect_gl is None:
            self.client_reconnect_gl = \
                vnc_greenlets.VncGreenlet("djm reconnect",
                                          self.zk_reconnect)
    # end client_reconnect

    def zk_reconnect(self):
        self._zookeeper_client.connect()
        self.client_reconnect_gl = None

    def db_read(self, obj_type, obj_id, obj_fields=None,
                ret_readonly=False):
        try:
            (ok, cassandra_result) = self._db_conn.object_read(
                obj_type, [obj_id], obj_fields, ret_readonly=ret_readonly)
        except NoIdError as e:
            # if NoIdError is for obj itself (as opposed to say for parent
            # or ref), let caller decide if this can be handled gracefully
            # by re-raising
            if e._unknown_id == obj_id:
                raise
            return (False, str(e))
        return (ok, cassandra_result[0])
    # end db_read

    def is_max_job_threshold_reached(self):
        if self._job_mgr_statistics.get('running_job_count') < \
                self._job_mgr_statistics.get('max_job_count'):
            return False
        return True
    # end is_max_job_threshold_reached

    def publish_job_status_notification(self, job_execution_id, status):
        try:
            msg = {'job_execution_id': job_execution_id,
                   'job_status': status}
            self._amqp_client.publish(
                msg, self.JOB_STATUS_EXCHANGE,
                routing_key=self.JOB_STATUS_ROUTING_KEY + job_execution_id,
                serializer='json', retry=True,
                retry_policy={'max_retries': 5,
                              'interval_start': 2,
                              'interval_step': 3,
                              'interval_max': 15},
                expiration=self.JOB_STATUS_TTL)
        except Exception:
            self._logger.error("Failed to send job status change "
                               "notification %s %s" % (job_execution_id,
                                                       status))
    # end publish_job_status_notification

    def get_job_template_id(self, job_template_fq_name):
        try:
            return self._db_conn.fq_name_to_uuid("job_template",
                                                 job_template_fq_name)
        except Exception as e:
            msg = "Error while reading job_template_id: " + str(e)
            self._logger.error(msg)
            raise
    # end get_job_template_id

    def handle_execute_job_request(self, body, message):
        job_input_params = body
        job_execution_id = job_input_params.get('job_execution_id')

        # check if the max job processing threshold is reached
        if not self.is_max_job_threshold_reached():
            message.ack()
            self._logger.info("SENT JOB REQUEST: {}".format(
                job_execution_id))
        else:
            # requeue the message if the max threshold is reached, to be
            # picked up by another job manager or to wait until the current
            # job mgr is free
            message.reject(requeue=True)
            self._logger.info("REQUEUE JOB REQUEST: {}".format(
                job_execution_id))
            gevent.sleep(1)
            return

        acfg = job_input_params.get('input', {}).get(
            'device_abstract_config')
        if acfg:
            job_input_params['input']['device_abstract_config'] = \
                json.loads(acfg)

        update_uve_on_failure = False
        device_list = None
        extra_params = job_input_params.get('params')
        if extra_params is not None:
            device_list = extra_params.get('device_list')
        is_delete = job_input_params.get('input').get('is_delete')
        job_template_fq_name = job_input_params.get('job_template_fq_name')
        job_template_id = job_input_params.get('job_template_id')
        fabric_fq_name = None
        fabric_job_uve_name = ''
        job_input_params['vnc_api_init_params'] = {
            "admin_user": self._args.admin_user,
            "admin_password": self._args.admin_password,
            "admin_tenant_name": self._args.admin_tenant_name,
            "api_server_port": self._args.api_server_port,
            "api_server_use_ssl": self._args.api_server_use_ssl
        }
        try:
            # populate job template id if not present in input_param
            if job_template_id is None:
                job_template_id = self.get_job_template_id(
                    job_template_fq_name)
                job_input_params["job_template_id"] = job_template_id

            # read the device object and pass the necessary data to the job
            if device_list:
                self.read_device_data(device_list, job_input_params,
                                      job_execution_id, is_delete)
            else:
                self.read_fabric_data(job_input_params, job_execution_id,
                                      is_delete)

            # read the job concurrency level from job template
            job_concurrency = self.get_job_concurrency(job_template_id,
                                                       job_execution_id)
            job_input_params['job_concurrency'] = job_concurrency

            fabric_fq_name = job_input_params.get('fabric_fq_name')
            fabric_job_uve_name_list = job_template_fq_name
            fabric_job_uve_name_list.insert(0, fabric_fq_name)
            fabric_job_uve_name = ':'.join(map(str,
                                               fabric_job_uve_name_list))

            device_fqnames = []
            # create the UVE
            if fabric_fq_name != "__DEFAULT__" and not device_list:
                self.create_fabric_job_uve(
                    fabric_job_uve_name,
                    job_input_params.get('job_execution_id'),
                    JobStatus.STARTING.value, 0.0)
            if device_list:
                device_fqnames = self.create_physical_router_job_uve(
                    device_list, job_input_params, fabric_job_uve_name,
                    JobStatus.STARTING.value, 0.0)

            # after creating the UVE, this flag indicates to update the
            # UVE upon any failures
            update_uve_on_failure = True

            # check if there is any other job running for the fabric
            if job_concurrency is not None and \
                    job_concurrency == "fabric":
                existing_job = self._is_existing_job_for_fabric(
                    fabric_fq_name, job_execution_id)
                if existing_job:
                    msg = "Another job for the same fabric is in" \
                          " progress. Please wait for the job to finish"
                    self.mark_failure(
                        msg, job_template_fq_name, job_execution_id,
                        fabric_fq_name, mark_uve=True,
                        device_list=device_list,
                        fabric_job_uve_name=fabric_job_uve_name,
                        job_params=job_input_params)
                    return

            start_time = time.time()
            signal_var = {
                'fabric_name': fabric_job_uve_name,
                'fabric_fq_name': fabric_fq_name,
                'start_time': start_time,
                'exec_id': job_execution_id,
                'device_fqnames': device_fqnames,
                'job_concurrency': job_concurrency
            }
            self.job_status.update(
                {job_execution_id: JobStatus.STARTING.value})
            self.publish_job_status_notification(
                job_execution_id, JobStatus.STARTING.value)

            # handle process exit signal
            signal.signal(signal.SIGCHLD, self.job_mgr_signal_handler)

            # write the abstract config to file if needed
            self.save_abstract_config(job_input_params)

            # add params needed for sandesh connection
            job_input_params['args'] = self._job_args

            # create job manager subprocess
            job_mgr_path = os.path.dirname(__file__) + \
                "/../job_manager/job_mgr.py"
            job_process = subprocess.Popen(
                ["python", job_mgr_path, "-i",
                 json.dumps(job_input_params)],
                cwd="/", close_fds=True)

            self._job_mgr_running_instances[str(job_process.pid)] = \
                signal_var
            self._job_mgr_statistics['running_job_count'] = len(
                self._job_mgr_running_instances)

            self._logger.notice("Created job manager process. "
                                "Execution id: %s" % job_execution_id)
            self._logger.info(
                "Current number of job_mgr processes running %s" %
                self._job_mgr_statistics.get('running_job_count'))
        except Exception as e:
            msg = "Exception while processing the job request %s %s %s : " \
                  "%s %s" % (job_template_fq_name, job_execution_id,
                             fabric_fq_name, repr(e),
                             traceback.format_exc())
            self.mark_failure(msg, job_template_fq_name, job_execution_id,
                              fabric_fq_name,
                              mark_uve=update_uve_on_failure,
                              device_list=device_list,
                              fabric_job_uve_name=fabric_job_uve_name,
                              job_params=job_input_params)
    # end handle_execute_job_request

    def _abort_job(self, pid, job_instance, abort_mode):
        self._logger.info("ABORT: pid={}, job_instance={}, mode={}".format(
            pid, job_instance, abort_mode))
        # Force abort or graceful abort
        os.kill(int(pid), signal.SIGABRT if abort_mode == "force"
                else signal.SIGUSR1)
    # end _abort_job

    def handle_abort_job_request(self, body, message):
        message.ack()
        inp = body.get('input')
        job_execution_ids = inp.get('job_execution_ids')
        abort_mode = inp.get('abort_mode')
        self._logger.info("Abort job request: job_ids={}, mode={}".format(
            job_execution_ids, abort_mode))

        # Search through running job instances to find this job
        for pid, job_instance in \
                list(self._job_mgr_running_instances.items()):
            # Abort one job
            if job_execution_ids:
                if job_instance.get('exec_id') in job_execution_ids:
                    self._abort_job(pid, job_instance, abort_mode)
            # Abort next job
            else:
                self._abort_job(pid, job_instance, abort_mode)
    # end handle_abort_job_request

    def create_fabric_job_uve(self, fabric_job_uve_name, execution_id,
                              job_status, percentage_completed):
        job_execution_data = FabricJobExecution(
            name=fabric_job_uve_name,
            execution_id=execution_id,
            job_start_ts=int(round(time.time() * 1000)),
            job_status=job_status,
            percentage_completed=percentage_completed)
        job_execution_uve = FabricJobUve(data=job_execution_data,
                                         sandesh=self._sandesh)
        job_execution_uve.send(sandesh=self._sandesh)
    # end create_fabric_job_uve

    def create_physical_router_job_uve(self, device_list, job_input_params,
                                       fabric_job_uve_name, job_status,
                                       percentage_completed):
        device_fqnames = []
        for device_id in device_list:
            device_fqname = job_input_params.get('device_json').get(
                device_id).get('device_fqname')
            device_fqname = ':'.join(map(str, device_fqname))
            prouter_uve_name = device_fqname + ":" + fabric_job_uve_name

            prouter_job_data = PhysicalRouterJobExecution(
                name=prouter_uve_name,
                execution_id=job_input_params.get('job_execution_id'),
                job_start_ts=int(round(time.time() * 1000)),
                job_status=job_status,
                percentage_completed=percentage_completed)

            prouter_job_uve = PhysicalRouterJobUve(
                data=prouter_job_data, sandesh=self._sandesh)
            prouter_job_uve.send(sandesh=self._sandesh)
            device_fqnames.append(prouter_uve_name)

        return device_fqnames
    # end create_physical_router_job_uve

    def mark_failure(self, msg, job_template_fq_name, job_execution_id,
                     fabric_fq_name, mark_uve=True, device_list=None,
                     fabric_job_uve_name=None, job_params=None):
        self._logger.error("Marked job as failed %s %s %s " %
                           (job_template_fq_name, job_execution_id, msg))
        # send job object log for failure
        self._job_log_utils.send_job_log(job_template_fq_name,
                                         job_execution_id, fabric_fq_name,
                                         msg, JobStatus.FAILURE.value)
        # update the in memory job status for the job
        self.job_status[job_execution_id] = JobStatus.FAILURE.value
        self.publish_job_status_notification(job_execution_id,
                                             JobStatus.FAILURE.value)
        # update the UVE
        if mark_uve:
            if fabric_fq_name != "__DEFAULT__" and not device_list:
                self.create_fabric_job_uve(fabric_job_uve_name,
                                           job_execution_id,
                                           JobStatus.FAILURE.value, 100.0)
            if device_list:
                self.create_physical_router_job_uve(
                    device_list, job_params, fabric_job_uve_name,
                    JobStatus.FAILURE.value, 100.0)
    # end mark_failure

    def _load_job_log(self, marker, input_str):
        json_str = input_str.split(marker)[1]
        try:
            return json.loads(json_str)
        except ValueError:
            return ast.literal_eval(json_str)
    # end _load_job_log

    def _extracted_file_output(self, execution_id):
        status = "FAILURE"
        prouter_info = {}
        device_op_results = {}
        failed_devices_list = []
        try:
            with open("/tmp/" + execution_id, "r") as f_read:
                for line in f_read:
                    if 'PROUTER_LOG##' in line:
                        job_log = self._load_job_log('PROUTER_LOG##', line)
                        fqname = ":".join(job_log.get('prouter_fqname'))
                        prouter_info[fqname] = job_log.get(
                            'onboarding_state')
                    if line.startswith('job_summary'):
                        job_log = self._load_job_log('JOB_LOG##', line)
                        status = job_log.get('job_status')
                        failed_devices_list = job_log.get(
                            'failed_devices_list')
                    if 'GENERIC_DEVICE##' in line:
                        job_log = self._load_job_log('GENERIC_DEVICE##',
                                                     line)
                        device_name = job_log.get('device_name')
                        device_op_results[device_name] = job_log.get(
                            'command_output')
        except Exception as e:
            msg = "File corresponding to execution id %s not found: " \
                  "%s\n%s" % (execution_id, str(e),
                              traceback.format_exc())
            self._logger.error(msg)

        return status, prouter_info, device_op_results, failed_devices_list
    # end _extracted_file_output

    def job_mgr_signal_handler(self, signalnum, frame):
        pid = None
        signal_var = None
        try:
            # get the child process id that called the signal handler
            pid = os.waitpid(-1, os.WNOHANG)
            signal_var = self._job_mgr_running_instances.get(str(pid[0]))
            if not signal_var:
                self._logger.error("Job mgr process %s not found in the "
                                   "instance map" % str(pid))
                return

            msg = "Entered job_mgr_signal_handler for: %s" % signal_var
            self._logger.notice(msg)
            exec_id = signal_var.get('exec_id')

            status, prouter_info, device_op_results, \
                failed_devices_list = self._extracted_file_output(exec_id)
            self.job_status[exec_id] = status
            self.publish_job_status_notification(exec_id, status)

            if signal_var.get('fabric_name') != "__DEFAULT__" \
                    and not signal_var.get('device_fqnames'):
                job_execution_data = FabricJobExecution(
                    name=signal_var.get('fabric_name'),
                    job_status=status,
                    percentage_completed=100)
                job_execution_uve = FabricJobUve(data=job_execution_data,
                                                 sandesh=self._sandesh)
                job_execution_uve.send(sandesh=self._sandesh)
            else:
                for prouter_uve_name in signal_var.get('device_fqnames'):
                    prouter_status = status
                    device_name = prouter_uve_name.split(":")[1]
                    if device_name in failed_devices_list:
                        prouter_status = "FAILURE"
                    prouter_job_data = PhysicalRouterJobExecution(
                        name=prouter_uve_name,
                        job_status=prouter_status,
                        percentage_completed=100,
                        device_op_results=json.dumps(
                            device_op_results.get(device_name, {})))
                    prouter_job_uve = PhysicalRouterJobUve(
                        data=prouter_job_data, sandesh=self._sandesh)
                    prouter_job_uve.send(sandesh=self._sandesh)

            for k, v in list(prouter_info.items()):
                prouter_uve_name = "%s:%s" % (
                    k, signal_var.get('fabric_name'))
                prouter_job_data = PhysicalRouterJobExecution(
                    name=prouter_uve_name,
                    execution_id=exec_id,
                    job_start_ts=int(round(
                        signal_var.get('start_time') * 1000)),
                    prouter_state=v)
                prouter_job_uve = PhysicalRouterJobUve(
                    data=prouter_job_data, sandesh=self._sandesh)
                prouter_job_uve.send(sandesh=self._sandesh)

            self._clean_up_job_data(signal_var, str(pid[0]))

            self._logger.info("Job : %s finished. Current number of "
                              "job_mgr processes running now %s " %
                              (signal_var, self._job_mgr_statistics[
                                  'running_job_count']))
        except OSError as process_error:
            self._logger.error("Could not retrieve the child process id. "
                               "OS call returned with error %s" %
                               str(process_error))
        except Exception as unknown_exception:
            self._clean_up_job_data(signal_var, str(pid[0]))
            self._logger.error("Failed in job signal handler %s" %
                               str(unknown_exception))
    # end job_mgr_signal_handler

    def _clean_up_job_data(self, signal_var, pid):
        # remove the pid entry of the processed job_mgr process
        del self._job_mgr_running_instances[pid]

        # clean up fabric level lock
        if signal_var.get('job_concurrency') is not None and \
                signal_var.get('job_concurrency') == "fabric":
            self._release_fabric_job_lock(signal_var.get('fabric_fq_name'))
            self._cleanup_job_lock(signal_var.get('fabric_fq_name'))

        self._job_mgr_statistics['running_job_count'] = len(
            self._job_mgr_running_instances)
    # end _clean_up_job_data

    def _is_existing_job_for_fabric(self, fabric_fq_name,
                                    job_execution_id):
        is_fabric_job_running = False
        # build the zk lock path
        fabric_node_path = '/job-manager/' + fabric_fq_name + '/' + \
            self.FABRIC_ZK_LOCK
        # check if the lock is already taken; if not taken, acquire the
        # lock by creating a node
        try:
            self._zookeeper_client.create_node(fabric_node_path,
                                               value=job_execution_id,
                                               ephemeral=True)
            self._logger.info("Acquired fabric lock for %s " %
                              fabric_node_path)
        except ResourceExistsError:
            # means the lock was acquired by some other job
            value = self._zookeeper_client.read_node(fabric_node_path)
            self._logger.error("Fabric lock is already acquired by "
                               "job %s " % value)
            is_fabric_job_running = True
        return is_fabric_job_running
    # end _is_existing_job_for_fabric

    def _release_fabric_job_lock(self, fabric_fq_name):
        # build the zk lock path
        fabric_node_path = '/job-manager/' + fabric_fq_name + '/' + \
            self.FABRIC_ZK_LOCK
        try:
            self._zookeeper_client.delete_node(fabric_node_path)
            self._logger.info("Released fabric lock for %s " %
                              fabric_node_path)
        except Exception as zk_error:
            self._logger.error("Exception while releasing the zookeeper "
                               "lock %s " % repr(zk_error))
    # end _release_fabric_job_lock

    def _cleanup_job_lock(self, fabric_fq_name):
        fabric_node_path = '/job-manager/' + fabric_fq_name
        try:
            if not self._zookeeper_client.get_children(fabric_node_path):
                self._zookeeper_client.delete_node(fabric_node_path)
                self._logger.info("Released fabric node for %s " %
                                  fabric_node_path)
        except Exception as zk_error:
            self._logger.error("Exception while releasing the fabric node "
                               "for %s: %s " % (fabric_node_path,
                                                str(zk_error)))
    # end _cleanup_job_lock

    def save_abstract_config(self, job_params):
        # Saving the device abstract config to a local file, as it could be
        # a large config. There will be one local file per device, and the
        # file is removed when the device is removed from the database.
        dev_abs_cfg = job_params.get('input', {}).get(
            'device_abstract_config')
        if dev_abs_cfg:
            dev_mgt_ip = dev_abs_cfg.get('system', {}).get('management_ip')
            if not dev_mgt_ip:
                raise ValueError('Missing management IP in abstract '
                                 'config')

            dev_cfg_dir = '/opt/contrail/fabric_ansible_playbooks/' \
                          'config/' + dev_mgt_ip
            if not os.path.exists(dev_cfg_dir):
                os.makedirs(dev_cfg_dir)
            with open(dev_cfg_dir + '/abstract_cfg.json', 'w') as f:
                f.write(json.dumps(dev_abs_cfg, indent=4))
            job_params.get('input').pop('device_abstract_config')
    # end save_abstract_config

    def get_job_concurrency(self, job_template_id, job_exec_id):
        (ok, result) = self.db_read("job-template", job_template_id,
                                    ['job_template_concurrency_level'])
        if not ok:
            msg = "Error while reading the job concurrency " \
                  "from the job template with id %s : %s" % \
                  (job_template_id, result)
            raise JobException(msg, job_exec_id)
        return result.get('job_template_concurrency_level')
    # end get_job_concurrency

    def read_device_data(self, device_list, request_params, job_exec_id,
                         is_delete=False):
        device_data = dict()
        for device_id in device_list:
            if not is_delete:
                try:
                    (ok, result) = self.db_read(
                        "physical-router", device_id,
                        ['physical_router_user_credentials',
                         'physical_router_management_ip', 'fq_name',
                         'physical_router_device_family',
                         'physical_router_vendor_name',
                         'physical_router_product_name',
                         'fabric_refs'])
                    if not ok:
                        msg = "Error while reading the physical router " \
                              "with id %s : %s" % (device_id, result)
                        raise JobException(msg, job_exec_id)
                except NoIdError as ex:
                    msg = "Device not found %s: %s" % (device_id, str(ex))
                    raise JobException(msg, job_exec_id)
                except Exception as e:
                    msg = "Exception while reading device %s %s " % \
                          (device_id, str(e))
                    raise JobException(msg, job_exec_id)

                device_fq_name = result.get('fq_name')
                device_mgmt_ip = result.get(
                    'physical_router_management_ip')
                user_cred = result.get('physical_router_user_credentials')
                device_family = result.get('physical_router_device_family')
                device_vendor_name = result.get(
                    'physical_router_vendor_name')
                device_product_name = result.get(
                    'physical_router_product_name')

                fabric_refs = result.get('fabric_refs')
                if fabric_refs:
                    fabric_fq_name = result.get('fabric_refs')[0].get('to')
                    fabric_fq_name_str = ':'.join(fabric_fq_name)
                    request_params['fabric_fq_name'] = fabric_fq_name_str
            else:
                device_mgmt_ip = request_params.get(
                    'input', {}).get('device_management_ip')
                device_abs_cfg = request_params.get(
                    'input', {}).get('device_abstract_config')

                system = device_abs_cfg.get('system', {})
                device_name = system.get('name')
                device_username = system.get('credentials',
                                             {}).get('user_name')
                device_password = system.get('credentials',
                                             {}).get('password')
                user_cred = {
                    "username": device_username,
                    "password": device_password
                }
                device_family = system.get('device_family')
                device_vendor_name = system.get('vendor_name')
                device_product_name = system.get('product_name')
                device_fq_name = ["default-global-system-config",
                                  device_name]

                self.read_fabric_data(request_params, job_exec_id,
                                      is_delete)

            device_json = {"device_management_ip": device_mgmt_ip}
            device_json.update({"device_fqname": device_fq_name})
            if user_cred:
                device_json.update(
                    {"device_username": user_cred.get('username')})
                decrypt_password = JobVncApi.decrypt_password(
                    encrypted_password=user_cred.get('password'),
                    pwd_key=device_id)
                device_json.update({"device_password": decrypt_password})
            if device_family:
                device_json.update({"device_family": device_family})
            if device_vendor_name:
                device_json.update({"device_vendor": device_vendor_name})
            if device_product_name:
                device_json.update({"device_product": device_product_name})

            device_data.update({device_id: device_json})

        if len(device_data) > 0:
            request_params.update({"device_json": device_data})
    # end read_device_data

    def read_fabric_data(self, request_params, job_execution_id,
                         is_delete=False):
        if request_params.get('input') is None:
            err_msg = "Missing job input"
            raise JobException(err_msg, job_execution_id)
        fabric_fq_name = None
        if request_params.get('input').get('fabric_fq_name'):
            fabric_fq_name = request_params.get('input').get(
                'fabric_fq_name')
        elif request_params.get('input').get('fabric_uuid'):
            # get the fabric fq_name from the db if fabric_uuid is provided
            fabric_uuid = request_params.get('input').get('fabric_uuid')
            try:
                fabric_fq_name = self._db_conn.uuid_to_fq_name(fabric_uuid)
            except NoIdError as e:
                raise JobException(str(e), job_execution_id)
        else:
            if "device_deletion_template" in request_params.get(
                    'job_template_fq_name'):
                fabric_fq_name = ["__DEFAULT__"]
            elif not is_delete:
                err_msg = "Missing fabric details in the job input"
                raise JobException(err_msg, job_execution_id)
        if fabric_fq_name:
            fabric_fq_name_str = ':'.join(map(str, fabric_fq_name))
            request_params['fabric_fq_name'] = fabric_fq_name_str
    # end read_fabric_data
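The fabric-level concurrency check in _is_existing_job_for_fabric is an ephemeral-znode lock: creation succeeds for exactly one job, and the node vanishes if the holder's session dies. The same idea with bare kazoo, where NodeExistsError plays the role of ResourceExistsError; the path mirrors the FABRIC_ZK_LOCK constant above:

    from kazoo.client import KazooClient
    from kazoo.exceptions import NodeExistsError

    def try_acquire_fabric_lock(zk, fabric_fq_name, job_execution_id):
        # One ephemeral znode per fabric; it disappears with the session,
        # so a crashed job manager cannot leave the fabric locked forever.
        path = '/job-manager/%s/fabric-job-monitor' % fabric_fq_name
        try:
            zk.create(path, job_execution_id.encode(),
                      ephemeral=True, makepath=True)
            return True
        except NodeExistsError:
            holder, _ = zk.get(path)
            print("fabric lock held by job %s" % holder.decode())
            return False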
def main(args_str=None, kube_api_skip=False, event_queue=None,
         vnc_kubernetes_config_dict=None):
    _zookeeper_client = None

    args = kube_args.parse_args(args_str)
    if 'kube_timer_interval' not in args:
        args.kube_timer_interval = '60'

    if args.cluster_id:
        client_pfx = args.cluster_id + '-'
        zk_path_pfx = args.cluster_id + '/'
    else:
        client_pfx = ''
        zk_path_pfx = ''

    # randomize collector list
    args.random_collectors = args.collectors
    if args.collectors:
        args.random_collectors = random.sample(args.collectors,
                                               len(args.collectors))

    km_logger = common_logger.KubeManagerLogger(args, http_server_port=-1)

    if args.nested_mode == '0':
        # Initialize AMQP handler then close it to be sure any queue
        # remaining from a previous run is cleaned up
        rabbitmq_cfg = kube_args.rabbitmq_args(args)
        try:
            vnc_amqp = VncAmqpHandle(
                km_logger._sandesh, km_logger, DBBaseKM, REACTION_MAP,
                'kube_manager', rabbitmq_cfg
            )
            vnc_amqp.establish()
            vnc_amqp.close()
        except Exception:  # FIXME: Except clause is too broad
            pass
        finally:
            km_logger.debug("Removed remained AMQP queue")

        # Ensure zookeeper is up and running before starting kube-manager
        _zookeeper_client = ZookeeperClient(client_pfx + "kube-manager",
                                            args.zk_server_ip)

        km_logger.notice("Waiting to be elected as master...")
        _zookeeper_client.master_election(
            zk_path_pfx + "/kube-manager", os.getpid(), run_kube_manager,
            km_logger, args, kube_api_skip, event_queue,
            vnc_kubernetes_config_dict)
    else:
        # nested mode, skip zookeeper mastership check
        run_kube_manager(km_logger, args, kube_api_skip, event_queue,
                         vnc_kubernetes_config_dict)