def get_next_ha_yarncluster():
    """
    Return the next available YARN RM instance and cache its name.

    Every entry of ``conf.YARN_CLUSTERS`` whose ``SUBMIT_TO`` flag is set is
    considered in iteration order. When fewer than two entries have the flag
    set, the first flagged entry is returned without contacting it. When two
    or more are flagged (HA setup), each candidate is queried through
    ``rm.cluster()`` and the first one reporting ``haState == 'ACTIVE'`` wins;
    its name is stored in the module-level ``MR_NAME_CACHE`` and the
    ``_api_cache`` attributes of the ``resource_manager_api`` and
    ``mapreduce_api`` modules are reset to ``None``.

    Returns:
        A ``(config, rm)`` tuple for the selected cluster, or ``None`` when no
        flagged cluster could be selected (none configured, none ACTIVE, or
        every probe raised).
    """
    # Imported at call time, as in the original; presumably to avoid a
    # circular import at module load -- TODO confirm.
    from hadoop.yarn import mapreduce_api
    from hadoop.yarn import resource_manager_api
    from hadoop.yarn.resource_manager_api import ResourceManagerApi

    global MR_NAME_CACHE

    # HA mode is assumed as soon as two or more clusters accept submissions.
    has_ha = sum(
        conf.YARN_CLUSTERS[name].SUBMIT_TO.get()
        for name in conf.YARN_CLUSTERS.keys()
    ) >= 2

    for name in conf.YARN_CLUSTERS.keys():
        config = conf.YARN_CLUSTERS[name]
        if not config.SUBMIT_TO.get():
            continue

        rm = ResourceManagerApi(
            config.RESOURCE_MANAGER_API_URL.get(),
            config.SECURITY_ENABLED.get(),
            config.SSL_CERT_CA_VERIFY.get(),
        )

        if not has_ha:
            # Single submit target: nothing to probe.
            return (config, rm)

        try:
            cluster_info = rm.cluster()
            if cluster_info['clusterInfo']['haState'] == 'ACTIVE':
                MR_NAME_CACHE = name
                LOG.warning('Picking RM HA: %s', name)
                resource_manager_api._api_cache = None  # Reset cache
                mapreduce_api._api_cache = None
                return (config, rm)
            # NOTE(review): the message says "RUNNING" although the check is
            # on haState == 'ACTIVE'; text kept unchanged for log consumers.
            LOG.info('RM %s is not RUNNING, skipping it: %s', name, cluster_info)
        except resource_manager_api.YarnFailoverOccurred:
            LOG.info('RM %s has failed back to another server', name)
        except Exception as ex:
            # Best effort: any failure on this candidate (network error,
            # unexpected payload shape, ...) just moves on to the next one.
            LOG.exception('RM %s is not available, skipping it: %s', name, ex)

    # No usable Resource Manager found.
    return None
def get_next_ha_yarncluster(current_user=None):
    """
    Return the next available YARN RM instance and cache its name.

    Every entry of ``conf.YARN_CLUSTERS`` whose ``SUBMIT_TO`` flag is set is
    considered in iteration order. A ``ResourceManagerApi`` client is built
    for it and bound to ``current_user`` (or ``DEFAULT_USER`` when
    ``current_user`` is ``None``). When fewer than two entries have the flag
    set, the first flagged entry is returned without contacting it. When two
    or more are flagged (HA setup), each candidate is queried through
    ``rm.cluster()`` and the first one reporting ``haState == 'ACTIVE'`` wins.
    If its name differs from the module-level ``MR_NAME_CACHE``, the change is
    logged, ``rm.from_failover`` is set to ``True`` and ``MR_NAME_CACHE`` is
    updated.

    Args:
        current_user: User passed to ``rm.setuser``; ``None`` selects
            ``DEFAULT_USER``.

    Returns:
        A ``(config, rm)`` tuple for the selected cluster, or ``None`` when no
        flagged cluster could be selected (none configured, none ACTIVE, or
        every probe raised).
    """
    # Imported at call time, as in the original; presumably to avoid a
    # circular import at module load -- TODO confirm.
    from hadoop.yarn.resource_manager_api import ResourceManagerApi

    global MR_NAME_CACHE

    # HA mode is assumed as soon as two or more clusters accept submissions.
    has_ha = sum(
        conf.YARN_CLUSTERS[name].SUBMIT_TO.get()
        for name in conf.YARN_CLUSTERS.keys()
    ) >= 2

    for name in conf.YARN_CLUSTERS.keys():
        config = conf.YARN_CLUSTERS[name]
        if not config.SUBMIT_TO.get():
            continue

        rm = ResourceManagerApi(
            config.RESOURCE_MANAGER_API_URL.get(),
            config.SECURITY_ENABLED.get(),
            config.SSL_CERT_CA_VERIFY.get(),
        )
        rm.setuser(DEFAULT_USER if current_user is None else current_user)

        if not has_ha:
            # Single submit target: nothing to probe.
            return (config, rm)

        try:
            cluster_info = rm.cluster()
            if cluster_info['clusterInfo']['haState'] == 'ACTIVE':
                if name != MR_NAME_CACHE:
                    # NOTE(review): if MR_NAME_CACHE starts out unset, the
                    # very first pick is also reported as a failover --
                    # confirm this is intended.
                    LOG.info('RM %s has failed back to %s server', MR_NAME_CACHE, name)
                    rm.from_failover = True
                    MR_NAME_CACHE = name
                LOG.warning('Picking RM HA: %s', name)
                return (config, rm)
            # NOTE(review): the message says "RUNNING" although the check is
            # on haState == 'ACTIVE'; text kept unchanged for log consumers.
            LOG.info('RM %s is not RUNNING, skipping it: %s', name, cluster_info)
        except Exception as ex:
            # Best effort: any failure on this candidate (network error,
            # unexpected payload shape, ...) just moves on to the next one.
            LOG.exception('RM %s is not available, skipping it: %s', name, ex)

    # No usable Resource Manager found.
    return None