def wait(master_count_filename):
    """Check that the local ZooKeeper reports the mode expected for the cluster.

    Args:
        master_count_filename: Object exposing ``exists()`` that names the
            file holding the expected cluster size. It is also passed to
            ``utils.read_file_text`` and the result is parsed with ``int``.

    Returns:
        None. Returns immediately when ``master_count_filename`` does not
        exist.

    Raises:
        SystemExit: With status 1 when ``get_zookeeper_mode`` raises
            ``ConnectionRefusedError`` or the reported mode is not one of
            the modes accepted for the expected cluster size.

    The Exhibitor status query that follows the mode check is informational
    only: every failure in that part is logged and the function returns
    normally.
    """
    if not master_count_filename.exists():
        # this is an agent
        log.info("master_count file doesn't exist, not waiting")
        return

    cluster_size = int(utils.read_file_text(master_count_filename))
    log.info('Expected cluster size: %s', cluster_size)

    try:
        zk_mode = get_zookeeper_mode()
    except ConnectionRefusedError:
        log.error('ZooKeeper not running')
        sys.exit(1)

    # A single-node cluster is expected to run standalone; otherwise the
    # local node must be either a follower or the leader.
    if cluster_size == 1:
        desired_modes = {ZK_MODE_STANDALONE}
    else:
        desired_modes = {ZK_MODE_FOLLOWER, ZK_MODE_LEADER}

    if zk_mode not in desired_modes:
        log.error('ZooKeeper not in correct mode: %s', zk_mode)
        sys.exit(1)

    log.info('ZooKeeper OK: %s', zk_mode)

    # Check Exhibitor, but do not fail if it shows unexpected results
    try:
        response = requests.get(EXHIBITOR_STATUS_URL)
    except requests.exceptions.RequestException as ex:
        # RequestException is the base class of ConnectionError; catching it
        # keeps this best-effort check from crashing on any request failure.
        log.error('Could not connect to exhibitor: %s', ex)
        return

    if response.status_code != 200:
        log.error(
            'Could not get exhibitor status: %s, Status code: %s',
            EXHIBITOR_STATUS_URL, response.status_code)
        return

    try:
        data = response.json()
    except ValueError:
        log.error('Non-JSON returned by Exhibitor: %r', response.content)
        return

    serving = []
    leaders = []
    try:
        for node in data:
            if node['isLeader']:
                leaders.append(str(node['hostname']))
            if node['description'] == 'serving':
                serving.append(str(node['hostname']))
    except (KeyError, TypeError):
        # Valid JSON, but not the list of node records this loop expects.
        log.error('Unexpected status returned by Exhibitor: %r', data)
        return

    log.info(
        'ZK servers: %r leaders: %r', ','.join(serving), ','.join(leaders))
def wait(master_count_filename):
    """Check via Exhibitor that the ZooKeeper cluster has the expected shape.

    Args:
        master_count_filename: Object exposing ``exists()`` that names the
            file holding the expected cluster size. It is also passed to
            ``utils.read_file_text`` and the result is parsed with ``int``.

    Returns:
        None. Returns early when ``master_count_filename`` does not exist or
        when ``try_shortcut()`` returns a truthy value.

    Raises:
        SystemExit: With status 1 when Exhibitor cannot be reached, answers
            with a non-200 status, returns a body that is not JSON, or
            reports a number of serving nodes different from the expected
            cluster size or a number of leaders different from one.
    """
    if not master_count_filename.exists():
        # this is an agent
        log.info("master_count file doesn't exist, not waiting")
        return

    if try_shortcut():
        log.info("Shortcut succeeeded, assuming local zk is in good config state, not waiting for quorum.")
        return
    log.info('Shortcut failed, waiting for exhibitor to bring up zookeeper and stabilize')

    cluster_size = int(utils.read_file_text(master_count_filename))
    log.info('Expected cluster size: %s', cluster_size)

    log.info('Waiting for ZooKeeper cluster to stabilize')
    try:
        response = requests.get(EXHIBITOR_STATUS_URL)
    except requests.exceptions.ConnectionError as ex:
        log.error('Could not connect to exhibitor: %s', ex)
        sys.exit(1)

    if response.status_code != 200:
        log.error(
            'Could not get exhibitor status: %s, Status code: %s',
            EXHIBITOR_STATUS_URL, response.status_code)
        sys.exit(1)

    try:
        data = response.json()
    except ValueError:
        # Report a non-JSON body the same way as the other Exhibitor
        # failures (log + exit 1) instead of dying with a raw traceback.
        log.error('Non-JSON returned by Exhibitor: %r', response.content)
        sys.exit(1)

    serving = []
    leaders = []
    for node in data:
        if node['isLeader']:
            leaders.append(node['hostname'])
        if node['description'] == 'serving':
            serving.append(node['hostname'])

    log.info(
        "Serving hosts: `%s`, leader: `%s`",
        ','.join(serving), ','.join(leaders))

    if len(serving) != cluster_size or len(leaders) != 1:
        log.error(
            'Expected %s servers and 1 leader, got %s servers and %s leaders',
            cluster_size, len(serving), len(leaders))
        sys.exit(1)

    # Local Zookeeper is up. Config should be stable, local zookeeper happy.
    # Stash the zk.pid mtime so if there is a restart we can come up quickly
    # without requiring a new zookeeper quorum.
    zk_pid_mtime = get_zk_pid_mtime()
    if zk_pid_mtime is not None:
        log.info(
            'Stashing zk.pid mtime %s to %s',
            zk_pid_mtime, stash_zk_pid_stat_mtime_path)
        utils.write_private_file(
            stash_zk_pid_stat_mtime_path, str(zk_pid_mtime).encode('utf8'))
def get_zk_pid():
    """Return what ``utils.read_file_text`` reads from ``zk_pid_path``."""
    pid_text = utils.read_file_text(zk_pid_path)
    return pid_text