def init(self):
    """
    Initialize properties

    Re-resolves the agent configuration and (re)creates the caches and
    helper objects held by this instance. The creation order is kept as-is
    because most helpers receive ``self`` and may read attributes assigned
    earlier in this method.
    """
    config = AmbariConfig.get_resolved_config()
    self.config = config
    self.is_registered = False

    # every cluster-level cache is rooted in the same directory
    cache_dir = config.cluster_cache_dir
    self.metadata_cache = ClusterMetadataCache(cache_dir)
    self.topology_cache = ClusterTopologyCache(cache_dir, config)
    self.host_level_params_cache = ClusterHostLevelParamsCache(cache_dir)
    self.configurations_cache = ClusterConfigurationCache(cache_dir)
    self.alert_definitions_cache = ClusterAlertDefinitionsCache(cache_dir)

    # collaborators that are handed this instance
    self.configuration_builder = ConfigurationBuilder(self)
    self.stale_alerts_monitor = StaleAlertsMonitor(self)
    self.file_cache = FileCache(config)
    self.customServiceOrchestrator = CustomServiceOrchestrator(self)
    self.recovery_manager = RecoveryManager(config.recovery_cache_dir)
    self.commandStatuses = CommandStatusDict(self)
    self.action_queue = ActionQueue(self)
    self.alert_scheduler_handler = AlertSchedulerHandler(self)
def __init__(self):
    """
    Create the stop event, declare every attribute with a placeholder value
    and then delegate the real construction to ``init()``.
    """
    self.stop_event = threading.Event()
    self.config = AmbariConfig.get_resolved_config()

    # placeholders only; init() assigns the real objects
    self.is_registered = None

    # cluster caches
    self.metadata_cache = self.topology_cache = None
    self.host_level_params_cache = self.configurations_cache = None
    self.alert_definitions_cache = None

    # helpers and handlers
    self.configuration_builder = self.stale_alerts_monitor = None
    self.server_responses_listener = None
    self.file_cache = self.customServiceOrchestrator = None
    self.recovery_manager = self.commandStatuses = None
    self.action_queue = self.alert_scheduler_handler = None

    self.init()
def _load_metric(self, ams_collector_host, ams_metric, host_filter):
    """
    Request a metric from the Ambari Metrics collector over HTTP(S).

    :param ams_collector_host: host name of the metrics collector to query
    :param ams_metric: value sent as the ``metricNames`` query parameter
    :param host_filter: value sent as the ``hostname`` query parameter
    :return: ``(None, status)`` when the request raises, where ``status`` is
      the HTTP status of the response if one was received, otherwise ``None``.

    NOTE(review): the visible portion of this method reads the response body
    into ``data`` but has no return statement on the success path; presumably
    the remainder of the method lies outside this excerpt - confirm.
    """
    get_metrics_parameters = {
        "metricNames": ams_metric,
        "appId": self.ams_app_id,
        "hostname": host_filter,
        "precision": "seconds",
        "grouped": "true",
    }
    encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)
    url = AMS_METRICS_GET_URL % encoded_get_metrics_parameters

    _ssl_version = AmbariConfig.get_resolved_config().get_force_https_protocol_value()

    # CA bundle handed to the connection factory
    ams_monitor_conf_dir = "/etc/ambari-metrics-monitor/conf"
    metric_truststore_ca_certs = 'ca.pem'
    ca_certs = os.path.join(ams_monitor_conf_dir, metric_truststore_ca_certs)

    conn = None
    response = None
    data = None
    try:
        conn = network.get_http_connection(ams_collector_host,
                                           int(self.ams_collector_port),
                                           self.use_ssl,
                                           ca_certs,
                                           ssl_version=_ssl_version)
        conn.request("GET", url)
        response = conn.getresponse()
        data = response.read()
    except Exception as exception:
        # the stack trace is only emitted when debug logging is enabled
        if logger.isEnabledFor(logging.DEBUG):
            logger.exception(
                "[Alert][{0}] Unable to retrieve metrics from AMS: {1}".format(
                    self.alert_id, str(exception)))
        # a response may already exist if the failure happened while reading it
        status = response.status if response else None
        return None, status
# Prefer hashlib.md5; fall back to the legacy md5 module when hashlib
# cannot be imported.
try:
    import hashlib
    _md5 = hashlib.md5
except ImportError:
    import md5
    _md5 = md5.new

logger = logging.getLogger(__name__)

# default timeout
DEFAULT_CONNECTION_TIMEOUT = 5

# Result record with the HTTP status code, the elapsed time and an error message.
WebResponse = namedtuple('WebResponse', 'status_code time_millis error_msg')

# Runs at import time with the forced HTTPS protocol name and the CA
# certificate path taken from the resolved agent configuration.
ensure_ssl_using_protocol(
    AmbariConfig.get_resolved_config().get_force_https_protocol_name(),
    AmbariConfig.get_resolved_config().get_ca_cert_file_path()
)


class WebAlert(BaseAlert):
    # Alert built on BaseAlert that is configured from the 'uri' entry of its
    # source metadata.

    def __init__(self, alert_meta, alert_source_meta, config):
        # All three arguments are forwarded unchanged to BaseAlert.__init__;
        # alert_source_meta is additionally inspected for a 'uri' entry.
        super(WebAlert, self).__init__(alert_meta, alert_source_meta, config)

        # NOTE(review): not used in the visible part of this constructor;
        # presumably consumed by code outside this excerpt - confirm.
        connection_timeout = DEFAULT_CONNECTION_TIMEOUT

        # extract any lookup keys from the URI structure
        self.uri_property_keys = None
        if 'uri' in alert_source_meta:
            uri = alert_source_meta['uri']
            self.uri_property_keys = self._lookup_uri_property_keys(uri)
def curl_krb_request(tmp_dir, keytab, principal, url, cache_file_prefix,
                     krb_exec_search_paths, return_only_http_code, caller_label, user,
                     connection_timeout=CONNECTION_TIMEOUT_DEFAULT, ca_certs=None,
                     kinit_timer_ms=DEFAULT_KERBEROS_KINIT_TIMER_MS,
                     method='', body='', header=''):
    """
    Makes a curl request using the kerberos credentials stored in a calculated cache file. The
    cache file name is built from the supplied cache prefix, the user, and an md5 hash of the
    principal and keytab.

    This function will use the klist command to determine if the cache is expired and will
    perform a kinit if necessary. Additionally, it has an internal timer to force a kinit after
    a configurable amount of time. This is to prevent boundary issues where requests hit the
    edge of a ticket's lifetime.

    :param tmp_dir: the directory to use for storing the local kerberos cache for this request.
    :param keytab: the location of the keytab to use when performing a kinit
    :param principal: the principal to use when performing a kinit
    :param url: the URL to request
    :param cache_file_prefix: an identifier used to build the unique cache name for this request.
                              This ensures that multiple requests can use the same cache.
    :param krb_exec_search_paths: the search path to use for invoking kerberos binaries
    :param return_only_http_code: True to return only the HTTP code, False to return GET content
    :param caller_label: an identifier to give context into the caller of this module (used for logging)
    :param user: the user to invoke the curl command as
    :param connection_timeout: connection timeout for curl, converted to an integer number of
                               seconds (defaults to CONNECTION_TIMEOUT_DEFAULT); curl's overall
                               --max-time is this value plus 2
    :param ca_certs: path to certificates; when None the path is looked up from the agent
                     configuration, and when no path can be determined curl runs with -k
    :param kinit_timer_ms: if specified, the time (in ms), before forcing a kinit even if the
                           klist cache is still valid.
    :param method: optional HTTP method passed to curl via -X; only honoured when
                   return_only_http_code is False
    :param body: optional request body passed via -d; ignored unless method is set
    :param header: optional request header passed via -H; ignored unless method is set
    :return: a tuple ``(result, error_msg, time_millis)``. ``result`` is the HTTP code as an int
             when return_only_http_code is True, the curl output otherwise, or "" when curl wrote
             nothing to stdout. ``error_msg`` is curl's stderr or None. Despite its name,
             ``time_millis`` is the elapsed time in seconds as returned by ``time.time()``.
    :raises Fail: re-raised when the curl invocation fails
    """

    import uuid

    # backward compatibility with old code and management packs, etc. All new code need pass
    # ca_certs explicitly
    if ca_certs is None:
        try:
            from ambari_agent.AmbariConfig import AmbariConfig
            ca_certs = AmbariConfig.get_resolved_config().get_ca_cert_file_path()
        except Exception:
            # best effort only: without a CA path the request falls back to 'curl -k'
            logger.debug("Unable to determine the CA certificate path from the agent configuration",
                         exc_info=True)

    # start off false
    is_kinit_required = False

    # Create the kerberos credentials cache (ccache) file and set it in the environment to use
    # when executing curl. Use the md5 hash of the combination of the principal and keytab file
    # to generate a (relatively) unique cache filename so that we can use it as needed. Scope
    # this file by user in order to prevent sharing of cache files by multiple users.
    ccache_file_name = _md5("{0}|{1}".format(principal, keytab)).hexdigest()

    curl_krb_cache_path = os.path.join(tmp_dir, "curl_krb_cache")
    if not os.path.exists(curl_krb_cache_path):
        os.makedirs(curl_krb_cache_path)
    # NOTE(review): the chmod is applied on every call rather than only on creation; this
    # nesting was reconstructed from whitespace-collapsed source - confirm.
    os.chmod(curl_krb_cache_path, 0o1777)

    ccache_file_path = "{0}{1}{2}_{3}_cc_{4}".format(curl_krb_cache_path, os.sep,
                                                     cache_file_prefix, user, ccache_file_name)
    kerberos_env = {'KRB5CCNAME': ccache_file_path}

    # concurrent kinit's can cause the following error:
    # Internal credentials cache error while storing credentials while getting initial credentials
    kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
    kinit_lock.acquire()
    try:
        # If there are no tickets in the cache or they are expired, perform a kinit, else use what
        # is in the cache
        if krb_exec_search_paths:
            klist_path_local = get_klist_path(krb_exec_search_paths)
        else:
            klist_path_local = get_klist_path()

        # take a look at the last time kinit was run for the specified cache and force a new
        # kinit if it's time; this helps to avoid problems approaching ticket boundary when
        # executing a klist and then a curl.
        # The timer is expressed in milliseconds, so the clock is read in milliseconds too;
        # subtracting a millisecond interval from a timestamp in seconds made the timer
        # effectively never fire.
        last_kinit_time_ms = _KINIT_CACHE_TIMES.get(ccache_file_name, 0)
        current_time_ms = int(time.time() * 1000)
        if current_time_ms - int(kinit_timer_ms) > last_kinit_time_ms:
            is_kinit_required = True

        # if the time has not expired, double-check that the cache still has a valid ticket
        if not is_kinit_required:
            klist_command = "{0} -s {1}".format(klist_path_local, ccache_file_path)
            is_kinit_required = (shell.call(klist_command, user=user)[0] != 0)

        # if kinit is required, then perform the kinit
        if is_kinit_required:
            if krb_exec_search_paths:
                kinit_path_local = get_kinit_path(krb_exec_search_paths)
            else:
                kinit_path_local = get_kinit_path()

            logger.debug(
                "Enabling Kerberos authentication for %s via GSSAPI using ccache at %s",
                caller_label, ccache_file_path)

            # kinit; there's no need to set a ticket timeout as this will use the default invalidation
            # configured in the krb5.conf - regenerating keytabs will not prevent an existing cache
            # from working correctly
            shell.checked_call("{0} -c {1} -kt {2} {3} > /dev/null".format(
                kinit_path_local, ccache_file_path, keytab, principal), user=user)

            # record kinit time (milliseconds, same unit as kinit_timer_ms)
            _KINIT_CACHE_TIMES[ccache_file_name] = current_time_ms
        else:
            # no kinit needed, use the cache
            logger.debug(
                "Kerberos authentication for %s via GSSAPI already enabled using ccache at %s.",
                caller_label, ccache_file_path)
    finally:
        kinit_lock.release()

    # check if cookies dir exists, if not then create it
    cookies_dir = os.path.join(tmp_dir, "cookies")
    if not os.path.exists(cookies_dir):
        os.makedirs(cookies_dir)

    # a fresh cookie jar per request; it is removed again in the finally block below
    cookie_file_name = str(uuid.uuid4())
    cookie_file = os.path.join(cookies_dir, cookie_file_name)

    start_time = time.time()
    error_msg = None

    # setup timeouts for the request; ensure we use integers since that is what curl needs
    connection_timeout = int(connection_timeout)
    maximum_timeout = connection_timeout + 2

    # without a CA bundle, certificate verification is disabled
    ssl_options = ['-k']
    if ca_certs:
        ssl_options = ['--cacert', ca_certs]

    try:
        if return_only_http_code:
            _, curl_stdout, curl_stderr = get_user_call_output(
                ['curl', '--location-trusted'] + ssl_options + [
                    '--negotiate', '-u', ':', '-b', cookie_file, '-c', cookie_file,
                    '-w', '%{http_code}', url,
                    '--connect-timeout', str(connection_timeout),
                    '--max-time', str(maximum_timeout),
                    '-o', '/dev/null'
                ],
                user=user, env=kerberos_env)
        else:
            # returns response body
            curl_command = ['curl', '--location-trusted'] + ssl_options + [
                '--negotiate', '-u', ':', '-b', cookie_file, '-c', cookie_file, url,
                '--connect-timeout', str(connection_timeout),
                '--max-time', str(maximum_timeout)
            ]

            # header and body are only sent together with an explicit method
            if method:
                if header:
                    curl_command.extend(['-H', header])
                curl_command.extend(['-X', method])
                if body:
                    curl_command.extend(['-d', body])

            _, curl_stdout, curl_stderr = get_user_call_output(
                curl_command, user=user, env=kerberos_env)
    except Fail:
        if logger.isEnabledFor(logging.DEBUG):
            logger.exception(
                "Unable to make a curl request for {0}.".format(caller_label))
        raise
    finally:
        if os.path.isfile(cookie_file):
            os.remove(cookie_file)

    # empty quotes evaluates to false
    if curl_stderr:
        error_msg = curl_stderr

    # elapsed wall-clock time in seconds; the name is kept for interface compatibility
    time_millis = time.time() - start_time

    # empty quotes evaluates to false
    if curl_stdout:
        if return_only_http_code:
            return (int(curl_stdout), error_msg, time_millis)
        return (curl_stdout, error_msg, time_millis)

    logger.debug("The curl response for %s is empty; standard error = %s",
                 caller_label, str(error_msg))
    return ("", error_msg, time_millis)
# preserving 2.4 compatibility. try: import hashlib _md5 = hashlib.md5 except ImportError: import md5 _md5 = md5.new logger = logging.getLogger(__name__) # default timeout DEFAULT_CONNECTION_TIMEOUT = 5 WebResponse = namedtuple('WebResponse', 'status_code time_millis error_msg') ensure_ssl_using_protocol(AmbariConfig.get_resolved_config().get_force_https_protocol()) class WebAlert(BaseAlert): def __init__(self, alert_meta, alert_source_meta, config): super(WebAlert, self).__init__(alert_meta, alert_source_meta, config) connection_timeout = DEFAULT_CONNECTION_TIMEOUT # extract any lookup keys from the URI structure self.uri_property_keys = None if 'uri' in alert_source_meta: uri = alert_source_meta['uri'] self.uri_property_keys = self._lookup_uri_property_keys(uri) if 'connection_timeout' in uri:
import logging
import httplib
import sys
from ssl import SSLError
from HeartbeatHandlers import HeartbeatStopHandlers
from ambari_agent.AmbariConfig import AmbariConfig
from ambari_commons.inet_utils import ensure_ssl_using_protocol

# Message text for SSL version-mismatch connection failures.
ERROR_SSL_WRONG_VERSION = "SSLError: Failed to connect. Please check openssl library versions. \n" +\
    "Refer to: https://bugzilla.redhat.com/show_bug.cgi?id=1022468 for more details."
# %-style template with three placeholders: request target, result and body.
LOG_REQUEST_MESSAGE = "GET %s -> %s, body: %s"

logger = logging.getLogger(__name__)

# Runs at import time with the forced HTTPS protocol taken from the resolved
# agent configuration.
ensure_ssl_using_protocol(
    AmbariConfig.get_resolved_config().get_force_https_protocol())


class NetUtil:
    # Only class-level constants are visible in this excerpt.

    # Delay and interval settings; the *_SEC names indicate seconds.
    DEFAULT_CONNECT_RETRY_DELAY_SEC = 10
    HEARTBEAT_IDLE_INTERVAL_DEFAULT_MIN_SEC = 1
    HEARTBEAT_IDLE_INTERVAL_DEFAULT_MAX_SEC = 10
    # NOTE(review): presumably also in seconds - confirm against the code that uses it.
    MINIMUM_INTERVAL_BETWEEN_HEARTBEATS = 0.1

    # Url within server to request during status check. This url
    # should return HTTP code 200
    SERVER_STATUS_REQUEST = "{0}/ca"

    # For testing purposes
    DEBUG_STOP_RETRIES_FLAG = False
def _get_ssl_version():
    """
    Return the forced HTTPS protocol value from the resolved agent configuration.
    """
    resolved_config = AmbariConfig.get_resolved_config()
    return resolved_config.get_force_https_protocol_value()
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations : a mapping of configuration key to value
    parameters : a mapping of script parameter key to value
    host_name : the name of this host where the alert is running

    :type configurations dict
    :type parameters dict
    :type host_name str

    In the visible portion of this function every explicit return is an early
    exit (RESULT_STATE_UNKNOWN or RESULT_STATE_SKIPPED).

    NOTE(review): the AMS response is read into ``response``/``data`` but not
    consumed here; presumably the evaluation of the metrics follows outside
    this excerpt - confirm. The nesting below was reconstructed from
    whitespace-collapsed source.
    """
    hostnames = host_name
    current_time = int(time.time()) * 1000

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    merge_ha_metrics = MERGE_HA_METRICS_PARAM_DEFAULT
    if MERGE_HA_METRICS_PARAM_KEY in parameters:
        merge_ha_metrics = parameters[MERGE_HA_METRICS_PARAM_KEY].lower() == 'true'

    metric_name = METRIC_NAME_PARAM_DEFAULT
    if METRIC_NAME_PARAM_KEY in parameters:
        metric_name = parameters[METRIC_NAME_PARAM_KEY]

    metric_units = METRIC_UNITS_DEFAULT
    if METRIC_UNITS_PARAM_KEY in parameters:
        metric_units = parameters[METRIC_UNITS_PARAM_KEY]

    app_id = APP_ID_PARAM_DEFAULT
    if APP_ID_PARAM_KEY in parameters:
        app_id = parameters[APP_ID_PARAM_KEY]

    interval = INTERVAL_PARAM_DEFAULT
    if INTERVAL_PARAM_KEY in parameters:
        interval = _coerce_to_integer(parameters[INTERVAL_PARAM_KEY])

    warning_threshold = DEVIATION_WARNING_THRESHOLD_DEFAULT
    if DEVIATION_WARNING_THRESHOLD_KEY in parameters:
        warning_threshold = _coerce_to_integer(parameters[DEVIATION_WARNING_THRESHOLD_KEY])

    critical_threshold = DEVIATION_CRITICAL_THRESHOLD_DEFAULT
    if DEVIATION_CRITICAL_THRESHOLD_KEY in parameters:
        critical_threshold = _coerce_to_integer(parameters[DEVIATION_CRITICAL_THRESHOLD_KEY])

    minimum_value_threshold = None
    if MINIMUM_VALUE_THRESHOLD_KEY in parameters:
        minimum_value_threshold = _coerce_to_integer(parameters[MINIMUM_VALUE_THRESHOLD_KEY])

    # parse configuration
    if configurations is None:
        return (RESULT_STATE_UNKNOWN,
                ['There were no configurations supplied to the script.'])

    # hdfs-site is required; this single check guards every later use of it
    if HDFS_SITE_KEY not in configurations:
        return (RESULT_STATE_UNKNOWN, [
            '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        ])

    if METRICS_COLLECTOR_VIP_HOST_KEY in configurations and METRICS_COLLECTOR_VIP_PORT_KEY in configurations:
        collector_host = configurations[METRICS_COLLECTOR_VIP_HOST_KEY]
        collector_port = int(configurations[METRICS_COLLECTOR_VIP_PORT_KEY])
    else:
        # ams-site/timeline.metrics.service.webapp.address is required
        if METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY not in configurations:
            return (RESULT_STATE_UNKNOWN, [
                '{0} is a required parameter for the script'.format(
                    METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)
            ])

        collector_webapp_address = configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY].split(":")
        if not valid_collector_webapp_address(collector_webapp_address):
            return (RESULT_STATE_UNKNOWN, [
                '{0} value should be set as "fqdn_hostname:port", but set to {1}'
                .format(
                    METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY,
                    configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])
            ])

        # only the port comes from the configured address; the host is chosen by the helper
        collector_host = select_metric_collector_for_sink(app_id.lower())
        collector_port = int(collector_webapp_address[1])

    namenode_service_rpc_address = None
    hdfs_site = configurations[HDFS_SITE_KEY]

    if 'dfs.namenode.servicerpc-address' in hdfs_site:
        namenode_service_rpc_address = hdfs_site['dfs.namenode.servicerpc-address']

    # if namenode alert and HA mode
    if NAMESERVICE_KEY in configurations and app_id.lower() == 'namenode':
        # NOTE(review): smokeuser stays undefined when SMOKEUSER_KEY is absent; on the
        # kerberos path the resulting NameError is caught and logged by the handler in
        # the loop below - confirm whether a default user should be used instead.
        if SMOKEUSER_KEY in configurations:
            smokeuser = configurations[SMOKEUSER_KEY]

        executable_paths = None
        if EXECUTABLE_SEARCH_PATHS in configurations:
            executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

        # parse script arguments
        security_enabled = False
        if SECURITY_ENABLED_KEY in configurations:
            security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

        kerberos_keytab = None
        if KERBEROS_KEYTAB in configurations:
            kerberos_keytab = configurations[KERBEROS_KEYTAB]

        kerberos_principal = None
        if KERBEROS_PRINCIPAL in configurations:
            kerberos_principal = configurations[KERBEROS_PRINCIPAL]
            kerberos_principal = kerberos_principal.replace('_HOST', host_name)

        # determine whether or not SSL is enabled
        is_ssl_enabled = False
        if DFS_POLICY_KEY in configurations:
            dfs_policy = configurations[DFS_POLICY_KEY]
            if dfs_policy == "HTTPS_ONLY":
                is_ssl_enabled = True

        kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER,
                                        DEFAULT_KERBEROS_KINIT_TIMER_MS)

        name_service = configurations[NAMESERVICE_KEY]

        # look for dfs.ha.namenodes.foo
        nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
        if nn_unique_ids_key not in hdfs_site:
            return (RESULT_STATE_UNKNOWN, [
                'Unable to find unique NameNode alias key {0}'.format(nn_unique_ids_key)
            ])

        namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
        jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

        if is_ssl_enabled:
            namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
            jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

        # now we have something like 'nn1,nn2,nn3,nn4'
        # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
        # ie dfs.namenode.http-address.hacluster.nn1
        namenodes = []
        active_namenodes = []
        nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
        for nn_unique_id in nn_unique_ids:
            key = namenode_http_fragment.format(name_service, nn_unique_id)
            if key not in hdfs_site:
                continue

            # use str() to ensure that unicode strings do not have the u' in them
            value = str(hdfs_site[key])
            namenode = value.split(":")[0]
            namenodes.append(namenode)

            try:
                jmx_uri = jmx_uri_fragment.format(value)
                if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
                    env = Environment.get_instance()

                    # curl requires an integer timeout
                    curl_connection_timeout = int(connection_timeout)
                    state_response, error_msg, time_millis = curl_krb_request(
                        env.tmp_dir, kerberos_keytab, kerberos_principal, jmx_uri,
                        "ha_nn_health", executable_paths, False,
                        "NameNode High Availability Health", smokeuser,
                        connection_timeout=curl_connection_timeout,
                        kinit_timer_ms=kinit_timer_ms)
                else:
                    state_response = get_jmx(jmx_uri, connection_timeout)

                state = _get_ha_state_from_json(state_response)

                if state == HDFS_NN_STATE_ACTIVE:
                    active_namenodes.append(namenode)

                    # Only check active NN
                    nn_service_rpc_address_key = 'dfs.namenode.servicerpc-address.{0}.{1}'.format(
                        name_service, nn_unique_id)
                    if nn_service_rpc_address_key in hdfs_site:
                        namenode_service_rpc_address = hdfs_site[nn_service_rpc_address_key]
            except Exception:
                # a NameNode that cannot be queried is simply not counted as active
                logger.exception("Unable to determine the active NameNode")

        if merge_ha_metrics:
            hostnames = ",".join(namenodes)
            # run only on active NN, no need to run the same requests from the standby
            if host_name not in active_namenodes:
                return (RESULT_STATE_SKIPPED,
                        ['This alert will be reported by another host.'])

    # Skip service rpc alert if port is not enabled
    if not namenode_service_rpc_address and 'rpc.rpc.datanode' in metric_name:
        return (RESULT_STATE_SKIPPED, ['Service RPC port is not enabled.'])

    get_metrics_parameters = {
        "metricNames": metric_name,
        "appId": app_id,
        "hostname": hostnames,
        # the interval is given in minutes; timestamps are in milliseconds
        "startTime": current_time - interval * 60 * 1000,
        "endTime": current_time,
        "grouped": "true",
    }
    encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)

    ams_monitor_conf_dir = "/etc/ambari-metrics-monitor/conf"
    metric_truststore_ca_certs = 'ca.pem'
    ca_certs = os.path.join(ams_monitor_conf_dir, metric_truststore_ca_certs)
    metric_collector_https_enabled = str(configurations[AMS_HTTP_POLICY]) == "HTTPS_ONLY"

    conn = None
    try:
        conn = network.get_http_connection(
            collector_host,
            int(collector_port),
            metric_collector_https_enabled,
            ca_certs,
            ssl_version=AmbariConfig.get_resolved_config().get_force_https_protocol_value())
        conn.request("GET", AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
        response = conn.getresponse()
        data = response.read()
    except Exception as e:
        logger.info(str(e))
        return (RESULT_STATE_UNKNOWN, [
            "Unable to retrieve metrics from the Ambari Metrics service."
        ])
    finally:
        # release the connection on the failure path as well; the body has already
        # been read in full when the request succeeded
        if conn is not None:
            try:
                conn.close()
            except Exception:
                logger.debug("Unable to close the connection to the Ambari Metrics service",
                             exc_info=True)
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations : a mapping of configuration key to value
    parameters : a mapping of script parameter key to value
    host_name : the name of this host where the alert is running

    :type configurations dict
    :type parameters dict
    :type host_name str

    The alert requests the configured metric from the Ambari Metrics service
    for the last ``interval`` minutes, computes the sample standard deviation
    as a percentage of the mean and compares it with the critical and warning
    thresholds. It returns RESULT_STATE_UNKNOWN when required configuration is
    missing or the metrics cannot be retrieved, and RESULT_STATE_SKIPPED when
    the alert is reported by another host, the service RPC port is not
    enabled, fewer than two data points exist or the mean is zero.

    NOTE(review): the nesting below was reconstructed from whitespace-collapsed
    source - confirm against the original file.
    """
    hostnames = host_name
    current_time = int(time.time()) * 1000

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    merge_ha_metrics = MERGE_HA_METRICS_PARAM_DEFAULT
    if MERGE_HA_METRICS_PARAM_KEY in parameters:
        merge_ha_metrics = parameters[MERGE_HA_METRICS_PARAM_KEY].lower() == 'true'

    metric_name = METRIC_NAME_PARAM_DEFAULT
    if METRIC_NAME_PARAM_KEY in parameters:
        metric_name = parameters[METRIC_NAME_PARAM_KEY]

    metric_units = METRIC_UNITS_DEFAULT
    if METRIC_UNITS_PARAM_KEY in parameters:
        metric_units = parameters[METRIC_UNITS_PARAM_KEY]

    app_id = APP_ID_PARAM_DEFAULT
    if APP_ID_PARAM_KEY in parameters:
        app_id = parameters[APP_ID_PARAM_KEY]

    interval = INTERVAL_PARAM_DEFAULT
    if INTERVAL_PARAM_KEY in parameters:
        interval = _coerce_to_integer(parameters[INTERVAL_PARAM_KEY])

    warning_threshold = DEVIATION_WARNING_THRESHOLD_DEFAULT
    if DEVIATION_WARNING_THRESHOLD_KEY in parameters:
        warning_threshold = _coerce_to_integer(parameters[DEVIATION_WARNING_THRESHOLD_KEY])

    critical_threshold = DEVIATION_CRITICAL_THRESHOLD_DEFAULT
    if DEVIATION_CRITICAL_THRESHOLD_KEY in parameters:
        critical_threshold = _coerce_to_integer(parameters[DEVIATION_CRITICAL_THRESHOLD_KEY])

    minimum_value_threshold = None
    if MINIMUM_VALUE_THRESHOLD_KEY in parameters:
        minimum_value_threshold = _coerce_to_integer(parameters[MINIMUM_VALUE_THRESHOLD_KEY])

    # parse configuration
    if configurations is None:
        return (RESULT_STATE_UNKNOWN,
                ['There were no configurations supplied to the script.'])

    # hdfs-site is required; this single check guards every later use of it
    if HDFS_SITE_KEY not in configurations:
        return (RESULT_STATE_UNKNOWN, [
            '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        ])

    if METRICS_COLLECTOR_VIP_HOST_KEY in configurations and METRICS_COLLECTOR_VIP_PORT_KEY in configurations:
        collector_host = configurations[METRICS_COLLECTOR_VIP_HOST_KEY]
        collector_port = int(configurations[METRICS_COLLECTOR_VIP_PORT_KEY])
    else:
        # ams-site/timeline.metrics.service.webapp.address is required
        if METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY not in configurations:
            return (RESULT_STATE_UNKNOWN, [
                '{0} is a required parameter for the script'.format(
                    METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)
            ])

        collector_webapp_address = configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY].split(":")
        if not valid_collector_webapp_address(collector_webapp_address):
            return (RESULT_STATE_UNKNOWN, [
                '{0} value should be set as "fqdn_hostname:port", but set to {1}'
                .format(
                    METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY,
                    configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])
            ])

        # only the port comes from the configured address; the host is chosen by the helper
        collector_host = select_metric_collector_for_sink(app_id.lower())
        collector_port = int(collector_webapp_address[1])

    namenode_service_rpc_address = None
    hdfs_site = configurations[HDFS_SITE_KEY]

    if 'dfs.namenode.servicerpc-address' in hdfs_site:
        namenode_service_rpc_address = hdfs_site['dfs.namenode.servicerpc-address']

    # if namenode alert and HA mode
    if NAMESERVICE_KEY in configurations and app_id.lower() == 'namenode':
        # NOTE(review): smokeuser stays undefined when SMOKEUSER_KEY is absent; on the
        # kerberos path the resulting NameError is caught and logged by the handler in
        # the loop below - confirm whether a default user should be used instead.
        if SMOKEUSER_KEY in configurations:
            smokeuser = configurations[SMOKEUSER_KEY]

        executable_paths = None
        if EXECUTABLE_SEARCH_PATHS in configurations:
            executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

        # parse script arguments
        security_enabled = False
        if SECURITY_ENABLED_KEY in configurations:
            security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

        kerberos_keytab = None
        if KERBEROS_KEYTAB in configurations:
            kerberos_keytab = configurations[KERBEROS_KEYTAB]

        kerberos_principal = None
        if KERBEROS_PRINCIPAL in configurations:
            kerberos_principal = configurations[KERBEROS_PRINCIPAL]
            kerberos_principal = kerberos_principal.replace('_HOST', host_name)

        # determine whether or not SSL is enabled
        is_ssl_enabled = False
        if DFS_POLICY_KEY in configurations:
            dfs_policy = configurations[DFS_POLICY_KEY]
            if dfs_policy == "HTTPS_ONLY":
                is_ssl_enabled = True

        kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER,
                                        DEFAULT_KERBEROS_KINIT_TIMER_MS)

        name_service = configurations[NAMESERVICE_KEY]

        # look for dfs.ha.namenodes.foo
        nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
        if nn_unique_ids_key not in hdfs_site:
            return (RESULT_STATE_UNKNOWN, [
                'Unable to find unique NameNode alias key {0}'.format(nn_unique_ids_key)
            ])

        namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
        jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

        if is_ssl_enabled:
            namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
            jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

        # now we have something like 'nn1,nn2,nn3,nn4'
        # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
        # ie dfs.namenode.http-address.hacluster.nn1
        namenodes = []
        active_namenodes = []
        nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
        for nn_unique_id in nn_unique_ids:
            key = namenode_http_fragment.format(name_service, nn_unique_id)
            if key not in hdfs_site:
                continue

            # use str() to ensure that unicode strings do not have the u' in them
            value = str(hdfs_site[key])
            namenode = value.split(":")[0]
            namenodes.append(namenode)

            try:
                jmx_uri = jmx_uri_fragment.format(value)
                if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
                    env = Environment.get_instance()

                    # curl requires an integer timeout
                    curl_connection_timeout = int(connection_timeout)
                    state_response, error_msg, time_millis = curl_krb_request(
                        env.tmp_dir, kerberos_keytab, kerberos_principal, jmx_uri,
                        "ha_nn_health", executable_paths, False,
                        "NameNode High Availability Health", smokeuser,
                        connection_timeout=curl_connection_timeout,
                        kinit_timer_ms=kinit_timer_ms)
                else:
                    state_response = get_jmx(jmx_uri, connection_timeout)

                state = _get_ha_state_from_json(state_response)

                if state == HDFS_NN_STATE_ACTIVE:
                    active_namenodes.append(namenode)

                    # Only check active NN
                    nn_service_rpc_address_key = 'dfs.namenode.servicerpc-address.{0}.{1}'.format(
                        name_service, nn_unique_id)
                    if nn_service_rpc_address_key in hdfs_site:
                        namenode_service_rpc_address = hdfs_site[nn_service_rpc_address_key]
            except Exception:
                # a NameNode that cannot be queried is simply not counted as active
                logger.exception("Unable to determine the active NameNode")

        if merge_ha_metrics:
            hostnames = ",".join(namenodes)
            # run only on active NN, no need to run the same requests from the standby
            if host_name not in active_namenodes:
                return (RESULT_STATE_SKIPPED,
                        ['This alert will be reported by another host.'])

    # Skip service rpc alert if port is not enabled
    if not namenode_service_rpc_address and 'rpc.rpc.datanode' in metric_name:
        return (RESULT_STATE_SKIPPED, ['Service RPC port is not enabled.'])

    get_metrics_parameters = {
        "metricNames": metric_name,
        "appId": app_id,
        "hostname": hostnames,
        # the interval is given in minutes; timestamps are in milliseconds
        "startTime": current_time - interval * 60 * 1000,
        "endTime": current_time,
        "grouped": "true",
    }
    encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)

    ams_monitor_conf_dir = "/etc/ambari-metrics-monitor/conf"
    metric_truststore_ca_certs = 'ca.pem'
    ca_certs = os.path.join(ams_monitor_conf_dir, metric_truststore_ca_certs)
    metric_collector_https_enabled = str(configurations[AMS_HTTP_POLICY]) == "HTTPS_ONLY"

    conn = None
    try:
        conn = network.get_http_connection(
            collector_host,
            int(collector_port),
            metric_collector_https_enabled,
            ca_certs,
            ssl_version=AmbariConfig.get_resolved_config().get_force_https_protocol_value())
        conn.request("GET", AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
        response = conn.getresponse()
        data = response.read()
    except Exception as e:
        # record why the request failed instead of dropping the cause silently
        logger.info(str(e))
        return (RESULT_STATE_UNKNOWN, [
            "Unable to retrieve metrics from the Ambari Metrics service."
        ])
    finally:
        # release the connection on the failure path as well; the body has already
        # been read in full when the request succeeded
        if conn is not None:
            try:
                conn.close()
            except Exception:
                logger.debug("Unable to close the connection to the Ambari Metrics service",
                             exc_info=True)

    if response.status != 200:
        return (RESULT_STATE_UNKNOWN, [
            "Unable to retrieve metrics from the Ambari Metrics service."
        ])

    data_json = json.loads(data)
    metrics = []
    # will get large standard deviation for multiple hosts,
    # if host1 reports small local values, but host2 reports large local values
    for metrics_data in data_json["metrics"]:
        metrics += metrics_data["metrics"].values()

    if len(metrics) < 2:
        return (RESULT_STATE_SKIPPED, [
            "There are not enough data points to calculate the standard deviation ({0} sampled)"
            .format(len(metrics))
        ])

    minimum_value_multiplier = 1
    if 'dfs.FSNamesystem.CapacityUsed' in metric_name:
        minimum_value_multiplier = 1024 * 1024  # MB to bytes
    elif 'rpc.rpc.datanode' in metric_name or 'rpc.rpc.client' in metric_name:
        minimum_value_multiplier = 1000  # seconds to millis

    if minimum_value_threshold:
        # Filter out points below min threshold
        minimum_value = minimum_value_threshold * minimum_value_multiplier
        metrics = [metric for metric in metrics if metric > minimum_value]
        if len(metrics) < 2:
            return (RESULT_STATE_OK, [
                'There were no data points above the minimum threshold of {0} seconds'
                .format(minimum_value_threshold)
            ])

    mean_value = mean(metrics)
    stddev = sample_standard_deviation(metrics)

    try:
        deviation_percent = stddev / float(mean_value) * 100
    except ZeroDivisionError:
        # should not be a case for this alert
        return (RESULT_STATE_SKIPPED, [
            "Unable to calculate the standard deviation because the mean value is 0"
        ])

    # log the AMS request
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("""
    AMS request parameters - {0}
    AMS response - {1}
    Mean - {2}
    Standard deviation - {3}
    Percentage standard deviation - {4}
    """.format(encoded_get_metrics_parameters, data_json, mean_value, stddev,
               deviation_percent))

    mean_value_localized = locale.format("%.0f", mean_value, grouping=True)

    variance_value = (deviation_percent / 100.0) * mean_value
    variance_value_localized = locale.format("%.0f", variance_value, grouping=True)

    # check for CRITICAL status first, then WARNING; both report with the same message
    for threshold, result_state in ((critical_threshold, RESULT_STATE_CRITICAL),
                                    (warning_threshold, RESULT_STATE_WARNING)):
        if deviation_percent > threshold:
            threshold_value = ((threshold / 100.0) * mean_value)
            threshold_value_localized = locale.format("%.0f", threshold_value, grouping=True)

            message = DEVIATION_THRESHOLD_MESSAGE.format(
                variance_value_localized, metric_units, deviation_percent,
                mean_value_localized, metric_units,
                threshold_value_localized, metric_units)
            return (result_state, [message])

    # return OK status; use the warning threshold as the value to compare against
    threshold_value = ((warning_threshold / 100.0) * mean_value)
    threshold_value_localized = locale.format("%.0f", threshold_value, grouping=True)

    message = DEVIATION_OK_MESSAGE.format(variance_value_localized, metric_units,
                                          warning_threshold,
                                          mean_value_localized, metric_units,
                                          threshold_value_localized, metric_units)
    return (RESULT_STATE_OK, [message])
def __init__(self):
    """
    Create the stop event, resolve the agent configuration and hand over to
    ``init()`` for the remaining setup.
    """
    stop_event = threading.Event()
    self.stop_event = stop_event

    resolved_config = AmbariConfig.get_resolved_config()
    self.config = resolved_config

    self.init()