@contextmanager
def bounce_lock_zookeeper(
    name: str, system_paasta_config: Optional[SystemPaastaConfig] = None
) -> Iterator:
    """Acquire a bounce lock in zookeeper for the name given. The name should
    generally be the service namespace being bounced.
    This is a contextmanager. Please use it via 'with bounce_lock(name):'.

    :param name: The lock name to acquire"""
    if system_paasta_config is None:
        system_paasta_config = load_system_paasta_config()
    zk = KazooClient(
        hosts=system_paasta_config.get_zk_hosts(),
        timeout=ZK_LOCK_CONNECT_TIMEOUT_S,
    )
    zk.start()
    lock = zk.Lock(f"{ZK_LOCK_PATH}/{name}")
    try:
        lock.acquire(timeout=1)  # timeout=0 throws some other strange exception
        yield
    except LockTimeout:
        raise LockHeldException("Service %s is already being bounced!" % name)
    else:
        lock.release()
    finally:
        zk.stop()
        zk.close()
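# A minimal usage sketch for the contextmanager above; the service.instance
# name and the body of the bounce are hypothetical, only the locking pattern
# comes from the docstring.
def deploy_one_bounce_sketch():
    try:
        with bounce_lock_zookeeper("example_service.main"):  # hypothetical name
            pass  # perform the bounce while the zookeeper lock is held
    except LockHeldException:
        pass  # another bounce is already in progress for this service; back off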
class ZK: client = None def __init__(self, zk_host): self.client = KazooClient(zk_host) self.client.start() def __del__(self): self.client.stop() def get_node(self, path): if not self.client.exists(path): return None node = ZKNode(path, self) return node def create_node(self, path): self.client.ensure_path(path) return self.get_node(path) def get_transaction(self): return self.client.transaction() def get_lock(self, path, id=None): return self.client.Lock(path + "/lock", id) def has_lock(self, path): lock_path = path + "/lock" if not self.client.exists(lock_path): return False if len(self.client.get_children(lock_path)) > 0: return True else: return False
class ZkHelper(object):
    def __init__(self, address='', port=''):
        assert address and port
        self.zk_address = address
        self.zk_port = port
        self.retry = KazooRetry(max_delay=10000, max_tries=None)
        self.zk = KazooClient(hosts='%s:%s' % (self.zk_address, self.zk_port),
                              connection_retry=self.retry, timeout=20)
        self.zk.add_listener(self._listener)
        self.zk.start()
        logging.info("instance zk client start (%s:%s)" % (self.zk_address, self.zk_port))

    @staticmethod
    def _listener(state):
        if state == KazooState.LOST:
            logging.info(
                "zk connect lost, stop this connection and then start new one!"
            )
        elif state == KazooState.SUSPENDED:
            logging.info(
                "zk connect suspended, stop this connection and then start new one!"
            )

    def write(self, path, data):
        self.zk.ensure_path(path)
        self.retry(self.zk.set, path, data)
        logging.info("write data:%s to path:%s" % (data, path))

    def ensure_path(self, path):
        self.zk.ensure_path(path)

    def read(self, path):
        if self.zk.exists(path):
            data = self.retry(self.zk.get, path)
            logging.info("read data:%s from path:%s" % (data, path))
            return data[0]
        logging.info("path:%s not exist" % path)

    def get_children_list(self, path):
        if self.zk.exists(path):
            data = self.retry(self.zk.get_children, path)
            logging.info("get children:%s from path:%s" % (data, path))
            return data
        logging.info("path:%s not exist" % path)

    def exists(self, path):
        return self.zk.exists(path)

    def get_lock(self, path):
        lock = self.retry(self.zk.Lock, path, threading.current_thread())
        return lock

    def close(self):
        self.zk.close()
def main(): """ Starts the AdminServer. """ logging.basicConfig(format=LOG_FORMAT, level=logging.INFO) parser = argparse.ArgumentParser() parser.add_argument('-p', '--port', type=int, default=constants.DEFAULT_PORT, help='The port to listen on') parser.add_argument('-v', '--verbose', action='store_true', help='Output debug-level logging') args = parser.parse_args() if args.verbose: logging.getLogger().setLevel(logging.DEBUG) options.define('secret', appscale_info.get_secret()) options.define('login_ip', appscale_info.get_login_ip()) options.define('private_ip', appscale_info.get_private_ip()) acc = appscale_info.get_appcontroller_client() ua_client = UAClient(appscale_info.get_db_master_ip(), options.secret) zk_client = KazooClient(hosts=','.join(appscale_info.get_zk_node_ips()), connection_retry=ZK_PERSISTENT_RECONNECTS) zk_client.start() version_update_lock = zk_client.Lock(constants.VERSION_UPDATE_LOCK_NODE) thread_pool = ThreadPoolExecutor(4) monit_operator = MonitOperator() all_resources = { 'acc': acc, 'ua_client': ua_client, 'zk_client': zk_client, 'version_update_lock': version_update_lock, 'thread_pool': thread_pool } if options.private_ip in appscale_info.get_taskqueue_nodes(): logging.info('Starting push worker manager') GlobalPushWorkerManager(zk_client, monit_operator) app = web.Application([ ('/v1/apps/([a-z0-9-]+)/services/([a-z0-9-]+)/versions', VersionsHandler, all_resources), ('/v1/apps/([a-z0-9-]+)/services/([a-z0-9-]+)/versions/([a-z0-9-]+)', VersionHandler, all_resources), ('/v1/apps/([a-z0-9-]+)/operations/([a-z0-9-]+)', OperationsHandler), ('/api/queue/update', UpdateQueuesHandler, { 'zk_client': zk_client }) ]) logging.info('Starting AdminServer') app.listen(args.port) io_loop = IOLoop.current() io_loop.start()
class TimeStampLeader(PublicationLeader):
    def __init__(self, zk_hosts, bdii_path):
        self.zk_hosts = zk_hosts
        self.zk = KazooClient(self.zk_hosts)
        self.bdii_path = bdii_path
        self.is_leader = False

    def pack_ts(self, input_dt):
        return struct.pack('f', self.gen_ts(input_dt))

    def gen_ts(self, input_dt):
        return time.mktime(input_dt.timetuple())

    def does_exist(self):
        if self.zk.exists(self.bdii_path) is not None:
            return True
        else:
            return False

    def is_stale(self, current_time):
        data, stat = self.zk.get(self.bdii_path)
        if not data:
            return True
        last_updated_timestamp = struct.unpack('f', data)[0]
        if last_updated_timestamp <= (self.gen_ts(current_time) - 120):
            return True
        else:
            return False

    def should_publish(self):
        self.zk.start()
        current_time = datetime.datetime.utcnow()
        if not self.does_exist():
            self.zk.create(self.bdii_path, self.pack_ts(current_time))
            self.is_leader = True
            return self.is_leader
        bdii_lock = self.zk.Lock(self.bdii_path, socket.getfqdn())
        try:
            # Pass the timeout by keyword: acquire()'s first positional
            # argument is `blocking`, so acquire(5.0) would block forever.
            lock_acquired = bdii_lock.acquire(timeout=5.0)
            if lock_acquired:
                self.is_leader = self.is_stale(current_time)
                bdii_lock.release()
                self.zk.stop()
                return self.is_leader
        except LockTimeout:
            # Another Compute Element has the lock
            pass
        return False

    def update_ts(self):
        if self.is_leader:
            self.zk.start()
            current_ts = self.gen_ts(datetime.datetime.utcnow())
            self.zk.set(self.bdii_path, struct.pack('f', current_ts))
            self.zk.stop()
def run(idx):
    zk_hosts = os.environ['RATER_ZK_HOSTS']
    run_time = int(os.environ['RATER_RUN_TIME'])
    currency_name = init_rate_table[idx]['currency']
    exchange_rate = init_rate_table[idx]['init_rate']
    zk = KazooClient(hosts=zk_hosts)
    zk.start()
    minute_cnt = 0
    while minute_cnt <= run_time:
        # Create a new rate table if it does not exist
        if not zk.exists('/rate_table/' + str(minute_cnt)):
            list_lock = zk.Lock('/rate_table/list_lock', 'list_lock')
            with list_lock:
                if not zk.exists('/rate_table/' + str(minute_cnt)):
                    raw_rate_table = json.dumps({
                        'RMB': -1,
                        'USD': -1,
                        'JPY': -1,
                        'EUR': -1
                    })
                    zk.create('/rate_table/' + str(minute_cnt),
                              raw_rate_table.encode('utf-8'), makepath=True)
        # Update rate table
        table_lock = zk.Lock('/rate_table/lock/' + str(minute_cnt),
                             'table_lock' + str(minute_cnt))
        with table_lock:
            byte_rate_table, stat = zk.get('/rate_table/' + str(minute_cnt))
            rate_table = json.loads(byte_rate_table.decode('utf-8'))
            rate_table[currency_name] = exchange_rate
            raw_rate_table = json.dumps(rate_table)
            zk.set('/rate_table/' + str(minute_cnt), raw_rate_table.encode('utf-8'))
        exchange_rate += 0.1
        minute_cnt += 1
        # time.sleep(60)
    zk.stop()
def run_with_lock(name):
    # Connect to ZooKeeper
    zk = KazooClient(hosts='39.108.147.32:2182')
    # Start the connection
    zk.start()
    # Create the lock
    lock = zk.Lock("/lockpath", "my-identifier")
    while True:
        # Check the current second; when it is a multiple of 5, call the
        # flash-sale (seckill) function concurrently
        if arrow.now().second % 5 == 0:
            with lock:
                seckilling()
                return
def zk_lock(zk: KazooClient, lock_path: str, contender_id: str, timeout: int) -> Generator: """ This contextmanager takes a ZooKeeper lock, yields, then releases the lock. This lock behaves like an interprocess mutex lock. ZooKeeper allows one to read values without holding a lock, but there is no guarantee that you will read the latest value. To read the latest value, you must call `sync()` on a ZNode before calling `get()`. Args: zk: The client to use to communicate with ZooKeeper. lock_path: The ZNode path to use as prefix for the locking recipe. contender_id: The contender id to identify the current client in the locking recipe. timeout: Time in seconds to wait for the lock to be acquired. If this time elapses before the lock is acquired, a `kazoo.exceptions.LockTimeout` exception is raised. Raises: kazoo.exceptions.LockTimeout: If the `timeout` is exceeded without the lock being acquired. """ lock = zk.Lock(lock_path, contender_id) try: log.info("Acquiring ZooKeeper lock.") lock.acquire(blocking=True, timeout=timeout, ephemeral=True) except (ConnectionLoss, SessionExpiredError) as e: msg_fmt = "Failed to acquire lock: {}" msg = msg_fmt.format(e.__class__.__name__) log.exception(msg) raise e except LockTimeout as e: msg_fmt = "Failed to acquire lock in `{}` seconds" msg = msg_fmt.format(timeout) log.exception(msg) raise e else: log.info("ZooKeeper lock acquired.") try: yield finally: log.info("Releasing ZooKeeper lock") lock.release() log.info("ZooKeeper lock released.")
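# A small sketch of the read pattern described in the docstring above: call
# sync() on a znode before get() to observe the latest committed value. The
# hosts string and znode path are assumptions for illustration.
def read_latest_value_sketch(hosts: str, znode: str) -> bytes:
    zk = KazooClient(hosts=hosts)
    zk.start()
    try:
        zk.sync(znode)  # ask this ZooKeeper server to catch up with the leader
        data, _stat = zk.get(znode)
        return data
    finally:
        zk.stop()
        zk.close()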
def main(loops=-1, loop_interval=60, restart_interval=30): """ :param loops: Number of loops. (set <0 for infinite) :param loop_interval: Time to sleep per loop (seconds). :param restart_interval: Time to sleep after a restart (seconds). :return: Return code for process. """ exhibitor = os.environ.get('EXHIBITOR_BASE') if not exhibitor: logger.error('Variable EXHIBITOR_BASE not found') return -1 base_properties = server_template() # Discover ZK and start server: zk_conn = zk_conn_string(exhibitor) kafka_pid = start_kafka(base_properties, zk_conn) # Loop: while loops != 0: # If Kafka has died, stop: kafka_pid.poll() if kafka_pid.returncode: logger.info('Kafka died: %s', kafka_pid.returncode) return kafka_pid.returncode # Poll Exhibitor for current ensemble: cur_zk = zk_conn_string(exhibitor) if cur_zk != zk_conn and len(cur_zk) >= len(zk_conn): logger.info('ZooKeeper ensemble change: %s', cur_zk) # If ensemble has changed, acquire lock: zk = KazooClient(hosts=','.join(cur_zk)) try: zk.start() with zk.Lock('/kafka-exhibitor/%s' % exhibitor): logger.info('Restart lock acquired, restarting...') # Lock acquired, restart: kafka_pid.terminate() kafka_pid.wait() kafka_pid = start_kafka(base_properties, cur_zk) zk_conn = cur_zk time.sleep(restart_interval) finally: zk.stop() # Loop: time.sleep(loop_interval) loops -= 1 return 0
@contextmanager
def create_app_lock():
    """Acquire a lock in zookeeper for creating a marathon app. This is
    due to marathon's extreme lack of resilience with creating multiple
    apps at once, so we use this to not do that and only deploy
    one app at a time."""
    zk = KazooClient(hosts=load_system_paasta_config().get_zk_hosts(),
                     timeout=ZK_LOCK_CONNECT_TIMEOUT_S)
    zk.start()
    lock = zk.Lock('%s/%s' % (ZK_LOCK_PATH, 'create_marathon_app_lock'))
    try:
        lock.acquire(timeout=30)  # timeout=0 throws some other strange exception
        yield
    except LockTimeout:
        raise LockHeldException("Failed to acquire lock for creating marathon app!")
    finally:
        lock.release()
        zk.stop()
def thread_process2(index): logDS.info("begin %d" % index) zk = KazooClient(hosts='172.10.3.111:2181', logger=logDS.logger) zk.start() node = "/my/lockpath" lock = zk.Lock(node, "my-identifier") with lock: # blocks waiting for lock acquisition # do something with the lock logDS.info("get the lock %d" % index) global gindex if gindex != index: logDS.error("error xxxx %d %d" % (gindex, index)) gindex = gindex + 1 if index == 0: time.sleep(10) zk.stop() logDS.info("exit %d" % index)
def create_lock(self):
    zk = KazooClient(hosts=self.zookeeper_quorum)
    zk.start()
    with zk.Lock("/", get_current_host_name()):
        proc_object = ProcCheck(self.process_name, self.monitor_interval,
                                self.start_command)
        proc_id = proc_object.get_process_id()
        if proc_id is None:
            proc_status = proc_object.start_process()
            if proc_status:
                proc_id = proc_object.get_process_id()
        # shall proc_status be False, proc_id will return None again,
        # which will be handled by ProcCheck.monitor()
        # as soon as monitor finishes, it will release lock,
        # allowing the service running on another machine to create lock.
        proc_object.monitor(proc_id)
@contextmanager
def create_autoscaling_lock():
    """Acquire a lock in zookeeper for autoscaling. This is
    to avoid autoscaling a service multiple times, and to avoid
    having multiple paasta services all attempting to autoscale and
    fetching mesos data."""
    zk = KazooClient(hosts=load_system_paasta_config().get_zk_hosts(),
                     timeout=ZK_LOCK_CONNECT_TIMEOUT_S)
    zk.start()
    lock = zk.Lock('/autoscaling/autoscaling.lock')
    try:
        lock.acquire(timeout=1)  # timeout=0 throws some other strange exception
        yield
    except LockTimeout:
        raise LockHeldException("Failed to acquire lock for autoscaling!")
    else:
        lock.release()
    finally:
        zk.stop()
@contextmanager
def bounce_lock_zookeeper(name):
    """Acquire a bounce lock in zookeeper for the name given. The name should
    generally be the service namespace being bounced.
    This is a contextmanager. Please use it via 'with bounce_lock(name):'.

    :param name: The lock name to acquire"""
    zk = KazooClient(hosts=load_system_paasta_config().get_zk_hosts(),
                     timeout=ZK_LOCK_CONNECT_TIMEOUT_S)
    zk.start()
    lock = zk.Lock('%s/%s' % (ZK_LOCK_PATH, name))
    try:
        lock.acquire(timeout=1)  # timeout=0 throws some other strange exception
        yield
    except LockTimeout:
        raise LockHeldException("Service %s is already being bounced!" % name)
    else:
        lock.release()
    finally:
        zk.stop()
class Orc(object): def __init__(self, host, port, supervisor, orc_host): self.zk = KazooClient('{}:{}'.format(host, port)) self.path = PathMaker() self.name_gen = NameGenerator() self.name = None self.supervisor = supervisor self.orc_host = orc_host self.setup() def setup_nodes(self): # Setup ephemeral nodes lock = self.zk.Lock(self.path.namelock()) with lock: used_names = self.zk.get_children(self.path.toolchain()) new_name = self.name_gen.generate() while new_name in used_names: new_name = self.name_gen.generate() self.name = new_name # Register watch DataWatch(self.zk, self.path.toolchain(self.name), self.on_sync) # Setup path for conf synchronization self.zk.create(self.path.toolchain(new_name), ephemeral=True) # Put information about node self.zk.create(self.path.node(self.name), value=self.orc_host, ephemeral=True) def setup(self): logger.info('Setting up Orc') self.zk.start() # Setup nodes self.setup_nodes() def on_sync(self, data, stat, event): if event and event.type == EventType.CHANGED: logger.info('Synchronizing toolchain') self.supervisor.update(data) def teardown(self): logger.info('Tearing down Orc') self.zk.stop() self.zk.close() self.supervisor.teardown()
@contextmanager
def zk_cluster_lock(zk: KazooClient, name: str, timeout: int = 30) -> Generator:
    lock = zk.Lock("{}/{}".format(ZK_PREFIX, name), socket.gethostname())
    try:
        print("Acquiring cluster lock '{}'".format(name))
        lock.acquire(blocking=True, timeout=timeout)
    except (ConnectionLoss, SessionExpiredError) as e:
        print("Failed to acquire cluster lock: {}".format(
            e.__class__.__name__))
        raise e
    except LockTimeout as e:
        print("Failed to acquire cluster lock in {} seconds".format(timeout))
        raise e
    else:
        print("ZooKeeper lock acquired.")
    try:
        yield
    finally:
        print("Releasing ZooKeeper lock")
        lock.release()
        print("ZooKeeper lock released.")
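# A brief usage sketch for zk_cluster_lock above, assuming it is applied as a
# context manager; the connection string and lock name are illustrative only.
def run_exclusive_sketch():
    zk = KazooClient(hosts="127.0.0.1:2181")
    zk.start()
    try:
        with zk_cluster_lock(zk, "maintenance", timeout=30):
            pass  # cluster-wide critical section runs while the lock is held
    finally:
        zk.stop()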
def test_lock_timeout(): zk = KazooClient(hosts="127.0.0.1:2181") zk.start() lock = zk.Lock("/zha-lock", "test") lock.acquire() obj = type('', (), {}) obj.flg = False def _oa(): obj.flg = True return 0 config = skelton.Config() config.check_health = lambda: 3 config.become_active = _oa z = zha.ZHA(config) trigger_zha(z) assert obj.flg is False lock.release() zk.stop() time.sleep(10)
class Zookeeper(object):
    HOSTS = ['127.0.0.1:2181']

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self._kill_subprocess()
        self.zk.stop()

    def __init__(self, app):
        self.app = app.strip().split()
        self.subprocess = None
        self.descendants = set()
        self.zk = KazooClient(hosts=self.HOSTS)
        self.zk.start()
        self.zk.ensure_path("/")
        self.lock = self.zk.Lock("/lock")
        self._set_z_watcher()

    def _start_subprocess(self):
        if not self.subprocess:
            self.subprocess = subprocess.Popen(args=self.app)

    def _kill_subprocess(self):
        if self.subprocess:
            self.subprocess.kill()
            self.subprocess = None

    def _set_children_watchers(self, path):
        if path not in self.descendants:
            self.descendants.add(path)
            if self.zk.exists(path):
                children = self.zk.get_children(path)
                for child in children:
                    new_path = os.path.join(path, child)
                    self._set_children_watchers(new_path)
                self._create_child_watcher(path)

    def _create_child_watcher(self, path):
        @self.zk.ChildrenWatch(path, send_event=True)
        def child_watcher(children, event):
            if event:
                path = event.path
                with self.lock:
                    diff_list = list({os.path.join(path, child) for child in children} - self.descendants)
                if len(diff_list) > 0:
                    for child in diff_list:
                        self._set_children_watchers(child)
                    with self.lock:
                        print("Current descendants:", len(self.descendants) - 1)
                else:
                    self._clean(path, children)

    def _clean(self, path, children):
        with self.lock:
            old_paths = {x for x in self.descendants if x.startswith(path) if x != path}
            new_paths = {os.path.join(path, child) for child in children}
            new_paths_extended = {descendant for descendant in self.descendants
                                  if any(descendant.startswith(new_path) for new_path in new_paths)}
            paths = old_paths - new_paths_extended
            for old_path in paths:
                self.descendants.remove(old_path)

    def _set_z_watcher(self):
        @self.zk.DataWatch('/z')
        def data_watcher(data, stat, event):
            if stat:
                self._start_subprocess()
                self._set_children_watchers("/z")
            else:
                self._kill_subprocess()
                self.descendants = set()

    def _print_tree(self):
        def _print_recursive(indent, node):
            print("│ " * indent, "├─", node, sep="")
            for child in self.zk.get_children(node):
                path = os.path.join(node, child)
                if self.zk.exists(path):
                    _print_recursive(indent + 1, path)

        if not self.zk.exists("/z"):
            print("No node /z for printing tree")
        else:
            print("/z")
            for child in self.zk.get_children("/z"):
                _print_recursive(0, os.path.join("/z", child))

    def _print_usage(self):
        print("q - quit")
        print("t - print tree")

    def handle(self):
        while True:
            self._print_usage()
            x = input('Command: ')
            if x == "q":
                break
            elif x == "t":
                self._print_tree()
            else:
                print("Unknown command")
class DistributedSequenceCoordinator(object): def __init__(self, zookeeper_connect, autoscaling_grp_name, strategy_name, instance_id, max_sequence_id, asg_instances_ids): self.zk = KazooClient(hosts=zookeeper_connect) self.running = False self.interrupted = False self.autoscaling_grp_name = autoscaling_grp_name self.strategy_name = strategy_name self.instance_id = instance_id self.max_sequence_id = max_sequence_id self.asg_instances_ids = asg_instances_ids def state_change_listener(self, state): logging.debug('zookeeper state changed to {0}'.format(state)) if state == KazooState.LOST or state == KazooState.SUSPENDED: if self.running: self.interrupted = True self.log_msg('distributed coordination interrupted') raise Exception('zookeeper session interrupted') """ Responsible for executing operation in isolation even-in cases of failures, connection-resets etc. Uses optimistic concurrency control by assuming that operation would be executed without any interruption, and if any interruption occurs, then acquires a new lock and re-execute the idempotent operation to guarantee isolation. """ def execute(self): result = None # exception-handling for cases where unable to establish connection to zookeeper try: # TODO: use python retrying lib to control with timeouts, max & exponential back-off wait time b/w retries while result is None or self.interrupted: self.running = True self.interrupted = False self.log_msg('distributed operation starting') self.zk.start() self.zk.add_listener(self.state_change_listener) try: lock = self.zk.Lock(zk_sequencer_root, self.autoscaling_grp_name) logging.debug('zookeeper lock created {}'.format( lock.data)) self.log_msg('entering zookeeper lock') with lock: result = self.operation() except Exception as e: logging.exception(e) self.log_msg('encountered zk exception') finally: self.log_msg('stopping zk') self.zk.stop() except Exception as e: raise e if result is None: raise Exception('Unable to generate sequence id') return result def operation(self): instances_root_path = "/".join( [zk_sequencer_root, self.autoscaling_grp_name]) self.zk.ensure_path(instances_root_path) instance_nodes = self.zk.get_children(instances_root_path) zk_instance_sequencers = {} for instance_node in instance_nodes: instance_node_path = "/".join([instances_root_path, instance_node]) instance_id = self.zk.get(instance_node_path)[0] zk_instance_sequencers[str(instance_id)] = int(instance_node) logging.debug('zk instances: {0}'.format(zk_instance_sequencers)) instance_sequencers = { k: v for k, v in zk_instance_sequencers.items() if k in self.asg_instances_ids } logging.debug('active instances with assigned sequences: {0}'.format( instance_sequencers)) generator = SequenceStrategy(self.strategy_name, self.instance_id, instance_sequencers, self.max_sequence_id) sequence_id = generator.get_sequence_id() current_instance_node_path = "/".join( [instances_root_path, str(sequence_id)]) self.zk.ensure_path(current_instance_node_path) self.zk.set(current_instance_node_path, str.encode(str(self.instance_id))) self.running = False return sequence_id def log_msg(self, msg): logging.debug('{0}, running = {1}, interrupted = {2}'.format( msg, self.running, self.interrupted))
class Listener: def __init__(self, hosts, root, workspace='/tmp'): self.zk = KazooClient(hosts=hosts) self.root = root self.workspace = os.path.abspath(workspace) self.tasks = [] self.event = threading.Event() self.hostname = os.uname().nodename def get_task(self, task_id): node = os.path.join(self.root, 'tasks', task_id, 'targets', self.hostname) lock_node = os.path.join(node, 'lock') lock = self.zk.Lock(lock_node, self.hostname.encode()) with lock: data, _ = self.zk.get(node) return json.dumps(data.decode()) def set_status(self, task_id, status): node = os.path.join(self.root, 'tasks', task_id, 'targets', self.hostname) lock_node = os.path.join(node, 'lock') lock = self.zk.Lock(lock_node, self.hostname.encode()) with lock: self.zk.set(node, status.encode()) signal_node = os.path.join(self.root, 'signal', task_id) self.zk.set(signal_node, uuid.uuid4().bytes) def get_job_server_list(self): node = os.path.join(self.root, 'job_server') return [ self.zk.get(os.path.join(node, x))[0] for x in self.zk.get_children(node) ] def get_log_server_list(self): node = os.path.join(self.root, 'log_server') result = [] for server in self.zk.get_children(node): address, port = server.split(':') result.append((address, int(port))) return result def render(self, params): for root, _, files in os.walk('.'): for tmpl in [f for f in files if f.endswith('.tmpl')]: path = os.path.join(root, tmpl) with open(path, 'r') as f: content = f.read() rendered = pystache.render(content, params) with open(path.replace('.tmpl', ''), 'w') as w: w.write(rendered) def _send_log(self, task_id, cmd, seq=1): log_server = random.choice(self.get_log_server_list()) s = socket.socket() s.connect(log_server) s.send(task_id.encode()) s.send(b'\n') s.send(self.hostname.encode()) s.send(b'\n') s.send('{0}'.format(seq)) s.send(b'\n\n') for buf in cmd.out_stream(): s.send(buf) s.close() def send_log(self, task_id, cmd): seq = 1 while not cmd.finish: t = threading.Thread(target=self._send_log, args=(task_id, cmd, seq)) t.start() t.join() seq += 1 def schedule(self, task_id): task = self.get_task(task_id) job_server = random.choice(self.get_job_server_list()) # http://xxx.xxx.xx.xxx/packages/ # magedu/test-job # http://xxx.xxx.xx.xxx/packages/magedu/test-job.zip url = '{0}/{1}.zip'.format(job_server, task_id['job_id']) response = requests.get(url) z = zipfile.ZipFile(io.BytesIO(response.content)) workspace = os.path.join(self.workspace, task_id) os.makedirs(workspace) os.chdir(workspace) z.extractall() try: self.render(task.get('params', {})) except Exception as e: logging.error(e) self.set_status(task_id, 'F') return os.chmod('./run.sh', 0o755) cmd = Command('run.sh', workspace, timeout=task.get('timeout', 0)) self.set_status(task_id, 'R') cmd.exec() self.send_log(task_id, cmd) cmd.wait() if cmd.success: self.set_status(task_id, 'S') else: self.set_status(task_id, 'F') def run(self): while not self.event.is_set(): if len(self.tasks) > 0: task_id = self.tasks.pop(0) try: self.schedule(task_id) finally: shutil.rmtree(os.path.join(self.workspace, task_id)) else: self.event.wait(1) def watch(self, tasks): new_tasks = set(tasks).difference(self.tasks) self.tasks.extend(new_tasks) return not self.event.is_set() def start(self): self.zk.start() node = os.path.join(self.root, 'agents', self.hostname) self.zk.ensure_path(node) tasks_node = os.path.join(node, 'tasks') self.zk.ensure_path(tasks_node) self.zk.create(os.path.join(node, 'alive'), str(datetime.datetime.now().timestamp()).encode(), ephemeral=True) ChildrenWatch(self.zk, tasks_node, 
self.watch) threading.Thread(target=self.run, name='task-runner').start() def shutdown(self): self.event.set() def join(self): self.event.wait()
#! /usr/bin/env python2 import socket import os from kazoo.client import KazooClient from kazoo.client import KazooState def zk_status(state): if state == KazooState.LOST: print 'lost session' elif state == KazooState.SUSPENDED: print 'disconnected from ZK' elif state == KazooState.CONNECTED: print 'connected' # API 0.3 spec # http://kazoo.readthedocs.org/en/0.3/api/client.html zk = KazooClient(hosts='server1:2181,vmk1:2181,vmk2:2181') zk.add_listener(zk_status) zk.start() lock = zk.Lock('/master', '%s-%d' % (socket.gethostname(), os.getpid())) zk.ensure_path("/path") zk.set("/path", "data_string".encode('utf8')) start_key, stat = zk.get("/path")
def get_cluster_status():
    try:
        return requests.get("{}admin/collections?action=CLUSTERSTATUS".format(
            SDAP_SOLR_URL)).json()
    except (requests.exceptions.ConnectionError, json.decoder.JSONDecodeError):
        return False


logging.info("Attempting to acquire lock from {}".format(SDAP_ZK_SOLR))
zk_host, zk_chroot = SDAP_ZK_SOLR.split('/')
zk = KazooClient(hosts=zk_host)
zk.start()
zk.ensure_path(zk_chroot)
zk.chroot = zk_chroot

lock = zk.Lock("/collection-creator", ZK_LOCK_GUID)
try:
    with lock:  # blocks waiting for lock acquisition
        logging.info(
            "Lock acquired. Checking for SolrCloud at {}".format(SDAP_SOLR_URL))

        # Wait for MAX_RETRIES for the entire Solr cluster to be available.
        attempts = 0
        status = None
        collection_exists = False
        while attempts <= MAX_RETRIES:
            status = get_cluster_status()
            if not status:
                # If we can't get the cluster status, my Solr node is not running
                attempts += 1
                logging.info("Waiting for Solr at {}".format(SDAP_SOLR_URL))
                time.sleep(1)
class Manager: def __init__(self): super(Manager, self).__init__() self._terminated = False self.config = config['zookeeper'] self.zk = KazooClient(**self.config) self.zk.start() self.zk.ensure_path('/jobs') self.zk.ensure_path('/settings/running') self.zk.set('/settings/running', 'false'.encode()) self.hadoop = HadoopModules() self.lock = self.zk.Lock('/settings/lock', 'lock') self._start_polling() def _start_polling(self): t = threading.Thread(target=self._try_execute_job, daemon=True) t.start() def _try_execute_job(self): if not self._terminated: threading.Timer(5.0, self._try_execute_job).start() self.execute_next_job() def enqueue_job(self, job): if job.id is None: return success = False with self.lock: node = "/jobs/{0}".format(job.id) if not self.zk.exists(node): self.zk.create(node) self.zk.create(node + "/jar_path", job.file_full_path().encode()) self.zk.create(node + "/retries", '0'.encode()) success = True log.info("Enqueued job {0}".format(job.id)) return success def execute_next_job(self): with self.lock: if self._is_running(): return children = self.zk.get_children('/jobs') if not children: return children = map(lambda s: int(s), children) next_job_id = min(children) if self._check_retries(next_job_id): return self.execute_job(next_job_id, False) else: Jobs.update_entity(next_job_id, status=Jobs.FAILED) self._delete_job(next_job_id) log.info( "Removing job {0} after 3 failures".format(next_job_id)) self.execute_next_job() def _delete_job(self, id): self.zk.delete("/jobs/{0}".format(id), recursive=True) def _check_retries(self, id): retries, _ = self.zk.get("/jobs/{0}/retries".format(id)) return int(retries.decode()) < 3 def execute_job(self, id, take_lock=True): if take_lock: with self.lock: self.execute_job_no_lock(id) else: self.execute_job_no_lock(id) def hadoop_callback(self, job_id, return_code, stdout, stderr): job = Jobs.find(job_id) if job: job.update(stdout=stdout, stderr=stderr) with self.lock: self._set_running(False) if return_code == 0: job.update(status=Jobs.FINISHED) self._delete_job(job_id) def execute_job_no_lock(self, id): log.info("Executing job {0}".format(id)) self._set_running(True) self._increase_retries(id) job = Jobs.find(id) job.update_entity(id, status=Jobs.RUNNING) path, _ = self.zk.get("/jobs/{0}/jar_path".format(id)) callback = lambda ret, out, err: self.hadoop_callback( id, ret, out, err) self.hadoop.start_hadoop(path.decode(), job.arguments_list(), callback) def _set_running(self, is_running): value = ('true' if is_running else 'false').encode() self.zk.set('/settings/running', value) def _is_running(self): v, _ = self.zk.get('/settings/running') return v.decode() == 'true' def _increase_retries(self, id): key = "/jobs/{0}/retries".format(id) retries, _ = self.zk.get(key) new_retries = int(retries.decode()) + 1 Jobs.update_entity(id, retries=new_retries) self.zk.set(key, str(new_retries).encode())
class Zookeeper(object): """ A wrapper class for Zookeeper interfacing, using the `Kazoo python library <https://kazoo.readthedocs.org/en/latest/index.html>`_. As Kazoo's functionality is mostly unaware of connection-state changes, it requires quite a bit of delicate code to make work reliably. E.g. Kazoo's Lock will claim to be held, even if the Zookeeper connection has been lost in the meantime. This causes an immediate split-brain problem for anything relying on that lock for synchronization. There is also, unfortunately, no documented way to inform the local Lock object that the connection is down and therefore the Lock should be released. All of Kazoo's events are done via callbacks. These callbacks must not block. If they do, no more Kazoo events can happen. E.g. if a watch callback blocks, disconnection callbacks will not run. """ def __init__(self, isd_as, srv_type, srv_id, zk_hosts, timeout=1.0, on_connect=None, on_disconnect=None): """ Setup the Zookeeper connection. :param ISD_AS isd_as: The local ISD-AS. :param str srv_type: a service type from :const:`lib.types.ServiceType` :param str srv_id: Service instance identifier. :param list zk_hosts: List of Zookeeper instances to connect to, in the form of ``["host:port"..]``. :param float timeout: Zookeeper session timeout length (in seconds). :param on_connect: A function called everytime a connection is made to Zookeeper. :param on_disconnect: A function called everytime a connection is lost to Zookeeper. """ self._isd_as = isd_as self._srv_id = b64encode(srv_id).decode("ascii") self._timeout = timeout self._on_connect = on_connect self._on_disconnect = on_disconnect self.prefix = "/%s/%s" % (self._isd_as, srv_type) # Keep track of our connection state self._connected = threading.Event() # Keep track of the kazoo lock self._lock = threading.Event() # Used to signal connection state changes self._state_events = queue.Queue() self.conn_epoch = 0 # Kazoo parties self._parties = {} # Kazoo lock (initialised later) self._zk_lock = None self._lock_epoch = 0 self._kazoo_setup(zk_hosts) self._setup_state_listener() self._kazoo_start() def _kazoo_setup(self, zk_hosts): """ Create and configure Kazoo client :param list zk_hosts: List of Zookeeper instances to connect to, in the form of ``["host:port"..]``. """ # Disable exponential back-off kretry = KazooRetry(max_tries=-1, max_delay=1) # Stop kazoo from drowning the log with debug spam: logger = logging.getLogger("KazooClient") logger.setLevel(logging.ERROR) # (For low-level kazoo debugging): # import kazoo.loggingsupport # logger.setLevel(kazoo.loggingsupport.BLATHER) self.kazoo = KazooClient(hosts=",".join(zk_hosts), timeout=self._timeout, connection_retry=kretry, logger=logger) def _kazoo_start(self): """Connect the Kazoo client to Zookeeper.""" logging.info("Connecting to Zookeeper") try: self.kazoo.start() except KazooTimeoutError: logging.critical( "Timed out connecting to Zookeeper on startup, exiting") kill_self() def _setup_state_listener(self): """ Spawn state listener thread, to respond to state change notifications from Kazoo. We use a thread, as the listener callback must not block. 
""" threading.Thread(target=thread_safety_net, args=(self._state_handler, ), name="libZK._state_handler", daemon=True).start() # Listener called every time connection state changes self.kazoo.add_listener(self._state_listener) def _state_listener(self, new_state): """Called everytime the Kazoo connection state changes.""" self.conn_epoch += 1 # Signal a connection state change logging.debug("Kazoo state changed to %s (epoch %d)", new_state, self.conn_epoch) self._state_events.put(new_state) # Tell kazoo not to remove this listener: return False def _state_handler(self, initial_state="startup"): """ A thread worker function to wait for Kazoo connection state changes, and call the relevant method. """ old_state = initial_state while True: # Wait for connection state change new_state = self._state_events.get() if (new_state == KazooState.CONNECTED and not self._state_events.empty()): # Helps prevent some state flapping. logging.debug("Kazoo CONNECTED ignored as the events " "queue is not empty.") continue # Short-circuit handler if the state hasn't actually changed. This # prooobably shouldn't happen now, so making it an error. if new_state == old_state: logging.error("Kazoo state didn't change from %s, ignoring", old_state) continue logging.debug("Kazoo old state: %s, new state: %s", old_state, new_state) old_state = new_state if new_state == KazooState.CONNECTED: self._state_connected() elif new_state == KazooState.SUSPENDED: self._state_suspended() else: self._state_lost() def _state_connected(self): """Handles the Kazoo 'connected' event.""" # Might be first connection, or reconnecting after a problem. clid = self.kazoo.client_id if clid is None: # Protect against a race-condition. return try: zk_peer = self.kazoo._connection._socket.getpeername() except AttributeError: zk_peer = "?", "?" logging.debug( "Connection to Zookeeper succeeded (Session: %s, ZK: [%s]:%s)", hex(clid[0]), zk_peer[0], zk_peer[1]) try: self.ensure_path(self.prefix, abs=True) # Use a copy of the dictionary values, as the dictioary is changed # by another thread. for party in list(self._parties.values()): party.autojoin() except ZkNoConnection: return self._connected.set() if self._on_connect: self._on_connect() def _state_suspended(self): """ Handles the Kazoo 'connection suspended' event. This means that the connection to Zookeeper is down. """ self._connected.clear() logging.info("Connection to Zookeeper suspended") if self._on_disconnect: self._on_disconnect() def _state_lost(self): """ Handles the Kazoo 'connection lost' event. This means that the Zookeeper session is lost, so all setup needs to be re-done on connect. """ self._connected.clear() logging.info("Connection to Zookeeper lost") if self._on_disconnect: self._on_disconnect() def is_connected(self): """Check if there is currently a connection to Zookeeper.""" return self._connected.is_set() def wait_connected(self, timeout=None): """ Wait until there is a connection to Zookeeper. Log every 10s until a connection is available. :param float timeout: Number of seconds to wait for a ZK connection. If ``None``, wait forever. :raises: ZkNoConnection: if there's no connection to ZK after timeout has expired. 
""" if self.is_connected(): return logging.debug("Waiting for ZK connection") start = time.time() total_time = 0.0 if timeout is None: next_timeout = 10.0 while True: if timeout is not None: next_timeout = min(timeout - total_time, 10.0) ret = self._connected.wait(timeout=next_timeout) total_time = time.time() - start if ret: logging.debug("ZK connection available after %.2fs", total_time) return elif timeout is not None and total_time >= timeout: logging.debug("ZK connection still unavailable after %.2fs", total_time) raise ZkNoConnection else: logging.debug("Still waiting for ZK connection (%.2fs so far)", total_time) def ensure_path(self, path, abs=False): """ Ensure that a path exists in Zookeeper. :param str path: Path to ensure :param bool abs: Is the path abolute or relative? :raises: ZkNoConnection: if there's no connection to ZK. """ full_path = path if not abs: full_path = os.path.join(self.prefix, path) try: self.kazoo.ensure_path(full_path) except (ConnectionLoss, SessionExpiredError): raise ZkNoConnection from None def party_setup(self, prefix=None, autojoin=True): """ Setup a `Kazoo Party <https://kazoo.readthedocs.org/en/latest/api/recipe/party.html>`_. Used to signal that a group of processes are in a similar state. :param str prefix: Path to create the party under. If not specified, uses the default prefix for this server instance. :param bool autojoin: Join the party if True, also on reconnect :return: a ZkParty object :rtype: ZkParty :raises: ZkNoConnection: if there's no connection to ZK. """ if not self.is_connected(): raise ZkNoConnection if prefix is None: prefix = self.prefix party_path = os.path.join(prefix, "party") self.ensure_path(party_path, abs=True) party = ZkParty(self.kazoo, party_path, self._srv_id, autojoin) self._parties[party_path] = party return party def get_lock(self, lock_timeout=None, conn_timeout=None): """ Try to get the lock. Returns immediately if we already have the lock. :param float lock_timeout: Time (in seconds) to wait for lock acquisition, or ``None`` to wait forever (Default). :param float conn_timeout: Time (in seconds) to wait for a connection to ZK, or ``None`` to wait forever (Default). :return: ``ZK_LOCK_FAIL`` if getting the lock failed, ``ZK_LOCK_SUCCESS`` if the lock was acquired, or ``ZK_LOCK_ALREADY`` if the lock is already held by this process. :rtype: :class:`int` """ if self._zk_lock is None: # First-time setup. 
lock_path = os.path.join(self.prefix, "lock") self._zk_lock = self.kazoo.Lock(lock_path, self._srv_id) elif self.have_lock(): return ZK_LOCK_ALREADY self.wait_connected(timeout=conn_timeout) self._lock_epoch = self.conn_epoch if lock_timeout is None: # Only need to log this when we could block for a long time logging.debug("Trying to acquire ZK lock (epoch %d)", self._lock_epoch) try: if self._zk_lock.acquire(timeout=lock_timeout): logging.info("Successfully acquired ZK lock (epoch %d)", self._lock_epoch) self._lock.set() except (ConnectionLoss, SessionExpiredError): raise ZkNoConnection from None except LockTimeout: pass except (AttributeError, TypeError): # Work-around for https://github.com/python-zk/kazoo/issues/288 pass if self.have_lock(): return ZK_LOCK_SUCCESS return ZK_LOCK_FAIL def release_lock(self): """Release the lock.""" self._lock.clear() if self._zk_lock is None: return if self.is_connected(): try: self._zk_lock.release() except (NoNodeError, ConnectionLoss, SessionExpiredError): pass # Hack suggested by https://github.com/python-zk/kazoo/issues/2 self._zk_lock.is_acquired = False def have_lock(self): """Check if we currently hold the lock.""" if (self.is_connected() and self._lock_epoch == self.conn_epoch and self._lock.is_set()): return True else: self.release_lock() return False def wait_lock(self): """Wait until we hold the lock.""" self._lock.wait() def get_lock_holder(self): """ Return address and port of the lock holder, or None if master is not elected. :raises: ZkNoConnection: if there's no connection to ZK. """ if self._zk_lock is None: return None try: contenders = self._zk_lock.contenders() if not contenders: logging.warning('No lock contenders found') return None return ZkID.from_raw(b64decode(contenders[0])) except (ConnectionLoss, SessionExpiredError): logging.warning("Disconnected from ZK.") raise ZkNoConnection from None def retry(self, desc, f, *args, _retries=4, _timeout=10.0, **kwargs): """ Execute a given operation, retrying it if fails due to connection problems. :param str desc: Description of the operation :param function f: Function to call, passing in \*args and \*\*kwargs :param int _retries: Number of times to retry the operation, or `None` to retry indefinitely. :param float _timeout: Number of seconds to wait for a connection, or `None` to wait indefinitely. """ count = -1 while True: count += 1 if _retries is not None and count > _retries: break try: self.wait_connected(timeout=_timeout) except ZkNoConnection: logging.warning("%s: No connection to ZK", desc) continue try: return f(*args, **kwargs) except ZkNoConnection: logging.warning("%s: Connection to ZK dropped", desc) raise ZkRetryLimit("%s: Failed %s times, giving up" % (desc, 1 + _retries))
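# A hedged usage sketch for the wrapper above: re-check have_lock() so that a
# lock invalidated by a reconnection (epoch change) is never trusted. The loop
# shape and timeouts are assumptions, not part of the original class.
def leader_only_loop_sketch(zk_wrapper):
    while True:
        got = zk_wrapper.get_lock(lock_timeout=5, conn_timeout=10)
        if got != ZK_LOCK_FAIL and zk_wrapper.have_lock():
            pass  # do leader-only work, re-checking have_lock() periodically
        time.sleep(1)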
class PartitionClient(object): """ Client Class for the Partition Library Example usage: --------------------- import libpartition from libpartition.libpartition import PartitionClient def own_change_cb(l): print "ownership change:" + str(l) c = PartitionClient("test", "s1", ["s1", "s2", "s3"], 32, own_change_cb, "zookeeper_s1") ##do some real work now" if (c.own_partition(1)): ...... do something with partition #1 ..... ......... ... c.update_cluster_list(["s1", "s2"]) ... ---------------------- You should not call any partition library routine from within the callback function Args: app_name(str): Name of the app for which partition cluster is used self_name(str): Name of the local cluster node (can be ip address) cluster_list(list): List of all the nodes in the cluster including local node max_partition(int): Partition space always go from 0..max_partition-1 partition_update_cb: Callback function invoked when partition ownership list is updated.x zk_server(str): <zookeeper server>:<zookeeper server port> """ def __init__(self, app_name, self_name, cluster_list, max_partition, partition_update_cb, zk_server, logger=None): # Initialize local variables self._zk_server = zk_server self._cluster_list = set(cluster_list) self._max_partition = max_partition self._update_cb = partition_update_cb self._curr_part_ownership_list = [] self._target_part_ownership_list = [] self._con_hash = ConsistentHash(cluster_list) self._name = self_name # some sanity check if not (self._name in cluster_list): raise ValueError('cluster list is missing local server name') # initialize logging and other stuff if logger is None: logging.basicConfig() self._logger = logging else: self._logger = logger self._conn_state = None self._sandesh_connection_info_update(status='INIT', message='') # connect to zookeeper while True: self._logger.error("Libpartition zk start") self._zk = KazooClient(zk_server, timeout=60.0) self._zk.add_listener(self._zk_listen) try: self._zk.start() while self._conn_state != ConnectionStatus.UP: gevent.sleep(1) break except Exception as e: # Update connection info self._sandesh_connection_info_update(status='DOWN', message=str(e)) self._zk.remove_listener(self._zk_listen) try: self._zk.stop() self._zk.close() except Exception as ex: template = "Exception {0} in Libpartition zk stop/close. 
Args:\n{1!r}" messag = template.format(type(ex).__name__, ex.args) self._logger.error("%s : traceback %s for %s" % \ (messag, traceback.format_exc(), self._name)) finally: self._zk = None gevent.sleep(1) # create a lock array to contain locks for each partition self._part_locks = [] for part in range(0, self._max_partition): lockpath = "/lockpath/" + app_name + "/" + str(part) l = self._zk.Lock(lockpath, self._name) self._part_locks.append(l) # initialize partition # to lock acquire greenlet dictionary self._part_lock_task_dict = {} self._logger.error("initial servers:" + str(self._cluster_list)) # update target partition ownership list for part in range(0, self._max_partition): if (self._con_hash.get_node(str(part)) == self._name): self._target_part_ownership_list.append(part) # update current ownership list self._acquire_partition_ownership() #end __init__ def _sandesh_connection_info_update(self, status, message): new_conn_state = getattr(ConnectionStatus, status) ConnectionState.update(conn_type=ConnectionType.ZOOKEEPER, name='Zookeeper', status=new_conn_state, message=message, server_addrs=self._zk_server.split(',')) if (self._conn_state and self._conn_state != ConnectionStatus.DOWN and new_conn_state == ConnectionStatus.DOWN): msg = 'Connection to Zookeeper down: %s' % (message) self._logger.error(msg) if (self._conn_state and self._conn_state != new_conn_state and new_conn_state == ConnectionStatus.UP): msg = 'Connection to Zookeeper ESTABLISHED' self._logger.error(msg) self._conn_state = new_conn_state # end _sandesh_connection_info_update def _zk_listen(self, state): self._logger.error("Libpartition listen %s" % str(state)) if state == KazooState.CONNECTED: # Update connection info self._sandesh_connection_info_update(status='UP', message='') elif state == KazooState.LOST: self._logger.error("Libpartition connection LOST") # Lost the session with ZooKeeper Server # Best of option we have is to exit the process and restart all # over again self._sandesh_connection_info_update( status='DOWN', message='Connection to Zookeeper lost') os._exit(2) elif state == KazooState.SUSPENDED: self._logger.error("Libpartition connection SUSPENDED") # Update connection info self._sandesh_connection_info_update( status='INIT', message='Connection to zookeeper lost. 
Retrying') # following routine is the greenlet task function to acquire the lock # for a partition def _acquire_lock(self, part): # lock for the partition l = self._part_locks[part] # go in an infinite loop waiting to acquire the lock try: while True: ret = l.acquire(blocking=False) if ret == True: self._logger.error("Acquired lock for:" + str(part)) self._curr_part_ownership_list.append(part) self._update_cb(self._curr_part_ownership_list) return True else: gevent.sleep(1) except CancelledError: self._logger.error("Lock acquire cancelled for:" + str(part)) return False except Exception as ex: # TODO: If we have a non-KazooException, the lock object # may get stuck in the "cancelled" state self._logger.error("Lock acquire unexpected error!: " + str(ex)) # This exception should get propogated to main thread raise SystemExit(1) return False #end _acquire_lock # get rid of finished spawned tasks from datastructures def _cleanup_greenlets(self): for part in list(self._part_lock_task_dict.keys()): if (self._part_lock_task_dict[part].ready()): del self._part_lock_task_dict[part] #end _cleanup_greenlets # following routine launches tasks to acquire partition locks def _acquire_partition_ownership(self): # cleanup any finished greenlets self._cleanup_greenlets() # this variable will help us decide if we need to call callback updated_curr_ownership = False # list of partitions for which locks have to be released release_lock_list = [] self._logger.info("known servers: %s" % self._con_hash.get_all_nodes()) for part in range(0, self._max_partition): if (part in self._target_part_ownership_list): if (part in self._curr_part_ownership_list): # do nothing, I already have ownership of this partition self._logger.info("No need to acquire ownership of:" + str(part)) else: # I need to acquire lock for this partition before I own if (part in list(self._part_lock_task_dict.keys())): try: self._part_lock_task_dict[part].get(block=False) except: # do nothing there is already a greenlet running to # acquire the lock self._logger.error("Already a greenlet running to" " acquire:" + str(part)) continue # Greenlet died without getting ownership. 
Cleanup self._logger.error("Cleanup stale greenlet running to" " acquire:" + str(part)) del self._part_lock_task_dict[part] self._logger.error("Starting greenlet running to" " acquire:" + str(part)) # launch the greenlet to acquire the loc, k g = Greenlet.spawn(self._acquire_lock, part) self._part_lock_task_dict[part] = g else: # give up ownership of the partition # cancel any lock acquisition which is ongoing if (part in list(self._part_lock_task_dict.keys())): try: self._part_lock_task_dict[part].get(block=False) except: self._logger.error( "canceling lock acquisition going on \ for:" + str(part)) # Cancelling the lock should result in killing the gevent self._part_locks[part].cancel() self._part_lock_task_dict[part].get(block=True) del self._part_lock_task_dict[part] if (part in self._curr_part_ownership_list): release_lock_list.append(part) self._curr_part_ownership_list.remove(part) updated_curr_ownership = True self._logger.error("giving up ownership of:" + str(part)) if (updated_curr_ownership is True): # current partition membership was updated call the callback self._update_cb(self._curr_part_ownership_list) if (len(release_lock_list) != 0): # release locks which were acquired for part in release_lock_list: self._logger.error("release the lock which was acquired:" + \ str(part)) try: self._part_locks[part].release() self._logger.error("fully gave up ownership of:" + str(part)) except: pass #end _acquire_partition_ownership def update_cluster_list(self, cluster_list): """ Updates the cluster node list Args: cluster_list(list): New list of names of the nodes in the cluster Returns: None """ # some sanity check if not (self._name in cluster_list): raise ValueError('cluster list is missing local server name') new_cluster_list = set(cluster_list) new_servers = list(new_cluster_list.difference(self._cluster_list)) deleted_servers = list( set(self._cluster_list).difference(new_cluster_list)) self._cluster_list = set(cluster_list) # update the hash structure if new_servers: self._logger.error("new servers:" + str(new_servers)) self._con_hash.add_nodes(new_servers) if deleted_servers: self._logger.error("deleted servers:" + str(deleted_servers)) self._con_hash.del_nodes(deleted_servers) # update target partition ownership list self._target_part_ownership_list = [] for part in range(0, self._max_partition): if (self._con_hash.get_node(str(part)) == self._name): if not (part in self._target_part_ownership_list): self._target_part_ownership_list.append(part) # update current ownership list self._acquire_partition_ownership() #end update_cluster_list def own_partition(self, part_no): """ Returns ownership information of a partition Args: part_no(int) : Partition no Returns: True if partition is owned by the local node False if partition is not owned by the local node """ return part_no in self._curr_part_ownership_list #end own_partition def close(self): """ Closes any connections and frees up any data structures Args: Returns: None """ # clean up greenlets for part in list(self._part_lock_task_dict.keys()): try: self._logger.error("libpartition greenlet cleanup %s" % str(part)) self._part_lock_task_dict[part].kill() except: pass self._zk.remove_listener(self._zk_listen) gevent.sleep(1) self._logger.error("Stopping libpartition") # close zookeeper try: self._zk.stop() except: self._logger.error("Stopping libpartition failed") else: self._logger.error("Stopping libpartition successful") self._logger.error("Closing libpartition") try: self._zk.close() except: self._logger.error("Closing 
libpartition failed") else: self._logger.error("Closing libpartition successful")
elif state == KazooState.SUSPENDED: print >>stderr, 'Connection to Zookeeper lost... Retrying...' else: print >>stderr, 'Connected.' zk.start() base_zk_path = '%s/%s' % (service_ns, service_id) def resolve_path(path): rel_path = relpath(path, config_dir) return base_zk_path if rel_path == '.' else join(base_zk_path, rel_path) if exists(config_dir) and isdir(config_dir): print >>stderr, 'Acquiring access lock...' with zk.Lock(base_zk_path + '.lock', node_id): for dirname, dirs, files in os.walk(config_dir): zk.ensure_path(resolve_path(dirname)) print >>stderr, ' Directory zk://' + resolve_path(dirname) for filename in files: filename = join(dirname, filename) config_path = resolve_path(filename) value = open(filename, 'rb').read() if zk.exists(config_path): print >>stderr, ' Updating zk://%s from %s [%d bytes]' % (config_path, filename, len(value)) zk.retry(zk.set, config_path, value) else: print >>stderr, ' Creating zk://%s from %s [%d bytes]' % (config_path, filename, len(value)) zk.retry(zk.create, config_path, value) else: print >>stderr, 'Invalid configuration directory'
class Lock(object):
    """Distributed lock module."""

    def __init__(self, name):
        """
        Initializer.

        :param str name: name of the distributed lock
        :return: None
        :rtype: None
        :raises kazoo.interfaces.IHandler.timeout_exception: connection timeout
        """
        self._lock_name = name
        self._lock_node_path = config.GuardianConfig.get_persistent_path("lock")
        self._lock_node = self._lock_node_path + '/' + self._lock_name
        self._lock_handle = None
        hosts = config.GuardianConfig.get(config.STATE_SERVICE_HOSTS_NAME)
        self._zkc = KazooClient(hosts=hosts)
        self._zkc.start()

    def create(self):
        """
        Create the distributed lock.

        :return: distributed lock handle
        :rtype: Kazoo lock
        """
        if not self._lock_handle:
            self._lock_handle = self._zkc.Lock(self._lock_node)
        return self._lock_handle

    def delete(self):
        """
        Delete the distributed lock.

        :return: None
        :rtype: None
        :raises kazoo.exceptions.NoNodeError: the lock does not exist
        :raises kazoo.exceptions.NotEmptyError: the lock is in use
        :raises kazoo.exceptions.ZookeeperError: ZooKeeper connection error
        """
        if not self._lock_handle:
            self._zkc.delete(self._lock_node)
            self._lock_handle = None

    def obtain(self):
        """
        Acquire the lock. This call blocks until the lock is acquired.

        :return: None
        :rtype: None
        """
        self._lock_handle.acquire()

    def obtain_wait(self, timeout):
        """
        Acquire the lock. Returns if the lock is obtained within ``timeout``
        seconds, otherwise raises an exception.

        :param int timeout: timeout in seconds for acquiring the lock
        :return: None
        :rtype: None
        :raises kazoo.exceptions.LockTimeout: timed out acquiring the lock
        """
        self._lock_handle.acquire(timeout=timeout)

    def release(self):
        """
        Release the lock.

        :return: None
        :rtype: None
        """
        self._lock_handle.release()

    def retain(self):
        """Re-entrant acquisition; not implemented yet."""
        pass
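# A brief usage sketch for the Lock class above; the lock name and timeout are
# illustrative values only.
def guarded_update_sketch():
    lk = Lock("rate_table_update")  # hypothetical lock name
    lk.create()
    try:
        lk.obtain_wait(timeout=10)  # raises kazoo.exceptions.LockTimeout on timeout
        pass  # critical section goes here
    finally:
        lk.release()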
# coding=utf-8
"""
Distributed lock
"""
from kazoo.client import KazooClient
import time
from uuid import uuid4

my_id = uuid4()

zk = KazooClient(hosts="127.0.0.1:2181")
zk.start()
lock = zk.Lock("/test/lock", my_id)


def work():
    print("i am %s" % my_id)


while True:
    with lock:
        work()
    time.sleep(3)

zk.stop()
def main(): """ Starts the AdminServer. """ logging.basicConfig(format=LOG_FORMAT, level=logging.INFO) parser = argparse.ArgumentParser( prog='appscale-admin', description='Manages AppScale-related processes') subparsers = parser.add_subparsers(dest='command') subparsers.required = True serve_parser = subparsers.add_parser( 'serve', description='Starts the server that manages AppScale processes') serve_parser.add_argument( '-p', '--port', type=int, default=constants.DEFAULT_PORT, help='The port to listen on') serve_parser.add_argument( '-v', '--verbose', action='store_true', help='Output debug-level logging') subparsers.add_parser( 'summary', description='Lists AppScale processes running on this machine') restart_parser = subparsers.add_parser( 'restart', description='Restart AppScale processes running on this machine') restart_parser.add_argument('service', nargs='+', help='The process or service ID to restart') args = parser.parse_args() if args.command == 'summary': table = sorted(list(get_combined_services().items())) print(tabulate(table, headers=['Service', 'State'])) sys.exit(0) if args.command == 'restart': socket_path = urlquote(ServiceManagerHandler.SOCKET_PATH, safe='') session = requests_unixsocket.Session() response = session.post( 'http+unix://{}/'.format(socket_path), data={'command': 'restart', 'arg': [args.service]}) response.raise_for_status() return if args.verbose: logger.setLevel(logging.DEBUG) options.define('secret', appscale_info.get_secret()) options.define('login_ip', appscale_info.get_login_ip()) options.define('private_ip', appscale_info.get_private_ip()) options.define('load_balancers', appscale_info.get_load_balancer_ips()) acc = appscale_info.get_appcontroller_client() ua_client = UAClient(appscale_info.get_db_master_ip(), options.secret) zk_client = KazooClient( hosts=','.join(appscale_info.get_zk_node_ips()), connection_retry=ZK_PERSISTENT_RECONNECTS) zk_client.start() version_update_lock = zk_client.Lock(constants.VERSION_UPDATE_LOCK_NODE) thread_pool = ThreadPoolExecutor(4) monit_operator = MonitOperator() all_resources = { 'acc': acc, 'ua_client': ua_client, 'zk_client': zk_client, 'version_update_lock': version_update_lock, 'thread_pool': thread_pool } if options.private_ip in appscale_info.get_taskqueue_nodes(): logger.info('Starting push worker manager') GlobalPushWorkerManager(zk_client, monit_operator) service_manager = ServiceManager(zk_client) service_manager.start() app = web.Application([ ('/oauth/token', OAuthHandler, {'ua_client': ua_client}), ('/v1/apps/([a-z0-9-]+)/services/([a-z0-9-]+)/versions', VersionsHandler, {'ua_client': ua_client, 'zk_client': zk_client, 'version_update_lock': version_update_lock, 'thread_pool': thread_pool}), ('/v1/projects', ProjectsHandler, all_resources), ('/v1/projects/([a-z0-9-]+)', ProjectHandler, all_resources), ('/v1/apps/([a-z0-9-]+)/services/([a-z0-9-]+)', ServiceHandler, all_resources), ('/v1/apps/([a-z0-9-]+)/services/([a-z0-9-]+)/versions/([a-z0-9-]+)', VersionHandler, all_resources), ('/v1/apps/([a-z0-9-]+)/operations/([a-z0-9-]+)', OperationsHandler, {'ua_client': ua_client}), ('/api/cron/update', UpdateCronHandler, {'acc': acc, 'zk_client': zk_client, 'ua_client': ua_client}), ('/api/datastore/index/add', UpdateIndexesHandler, {'zk_client': zk_client, 'ua_client': ua_client}), ('/api/queue/update', UpdateQueuesHandler, {'zk_client': zk_client, 'ua_client': ua_client}) ]) logger.info('Starting AdminServer') app.listen(args.port) management_app = web.Application([ ('/', ServiceManagerHandler, 
{'service_manager': service_manager})]) management_server = HTTPServer(management_app) management_socket = bind_unix_socket(ServiceManagerHandler.SOCKET_PATH) management_server.add_socket(management_socket) io_loop = IOLoop.current() io_loop.start()
class Coordinator(object): def __init__(self, zk_hosts, hostname, port, join_cluster): self.me = '%s:%s' % (hostname, port) self.is_master = None self.slaves = cycle([]) self.slave_count = 0 self.started_shutdown = False if join_cluster: read_only = False else: read_only = True self.zk = KazooClient(hosts=zk_hosts, handler=SequentialGeventHandler(), read_only=read_only) event = self.zk.start_async() event.wait(timeout=5) self.lock = self.zk.Lock(path='/iris/sender_master', identifier=self.me) # Used to keep track of slaves / senders present in cluster self.party = Party(client=self.zk, path='/iris/sender_nodes', identifier=self.me) if join_cluster: self.zk.add_listener(self.event_listener) self.party.join() def am_i_master(self): return self.is_master # Used for API to get the current master def get_current_master(self): try: contenders = self.lock.contenders() except kazoo.exceptions.KazooException: logger.exception('Failed getting contenders') return None if contenders: return self.address_to_tuple(contenders[0]) else: return None # Used for API to get the current slaves if master can't be reached def get_current_slaves(self): return [self.address_to_tuple(host) for host in self.party] def address_to_tuple(self, address): try: host, port = address.split(':') return host, int(port) except (IndexError, ValueError): logger.error('Failed getting address tuple from %s', address) return None def update_status(self): if self.started_shutdown: return if self.zk.state == KazooState.CONNECTED: if self.lock.is_acquired: self.is_master = True else: try: self.is_master = self.lock.acquire(blocking=False, timeout=2) # This one is expected when we're recovering from ZK being down except kazoo.exceptions.CancelledError: self.is_master = False except kazoo.exceptions.LockTimeout: self.is_master = False logger.exception( 'Failed trying to acquire lock (shouldn\'t happen as we\'re using nonblocking locks)' ) except kazoo.exceptions.KazooException: self.is_master = False logger.exception( 'ZK problem while Failed trying to acquire lock') else: logger.error('ZK connection is in %s state', self.zk.state) self.is_master = False if self.zk.state == KazooState.CONNECTED: if self.is_master: slaves = [ self.address_to_tuple(host) for host in self.party if host != self.me ] self.slave_count = len(slaves) self.slaves = cycle(slaves) else: self.slaves = cycle([]) self.slave_count = 0 # Keep us as part of the party, so the current master sees us as a slave if not self.party.participating: try: self.party.join() except kazoo.exceptions.KazooException: logger.exception('ZK problem while trying to join party') else: self.slaves = cycle([]) self.slave_count = 0 def update_forever(self): while True: if self.started_shutdown: return old_status = self.is_master self.update_status() new_status = self.is_master if old_status != new_status: log = logger.info else: log = logger.debug if self.is_master: log('I am the master sender') else: log('I am a slave sender') metrics.set('slave_instance_count', self.slave_count) metrics.set('is_master_sender', int(self.is_master is True)) sleep(UPDATE_FREQUENCY) def leave_cluster(self): self.started_shutdown = True # cancel any attempts to acquire master lock which could make us hang self.lock.cancel() if self.zk.state == KazooState.CONNECTED: if self.party and self.party.participating: logger.info('Leaving party') self.party.leave() if self.lock and self.lock.is_acquired: logger.info('Releasing lock') self.lock.release() def event_listener(self, state): if state == KazooState.LOST or state 
== KazooState.SUSPENDED: logger.info( 'ZK state transitioned to %s. Resetting master status.', state) # cancel pending attempts to acquire lock which will break and leave # us in bad state self.lock.cancel() # make us try to re-acquire lock during next iteration when we're connected if self.lock.is_acquired: self.lock.is_acquired = False # make us try to rejoin the party during next iteration when we're connected if self.party.participating: self.party.participating = False # in the meantime we're not master self.is_master = None