Example #1
    def init(self):
        """
        Initialize properties
        """
        self.config = AmbariConfig.get_resolved_config()

        self.is_registered = False

        self.metadata_cache = ClusterMetadataCache(
            self.config.cluster_cache_dir)
        self.topology_cache = ClusterTopologyCache(
            self.config.cluster_cache_dir, self.config)
        self.host_level_params_cache = ClusterHostLevelParamsCache(
            self.config.cluster_cache_dir)
        self.configurations_cache = ClusterConfigurationCache(
            self.config.cluster_cache_dir)
        self.alert_definitions_cache = ClusterAlertDefinitionsCache(
            self.config.cluster_cache_dir)
        self.configuration_builder = ConfigurationBuilder(self)
        self.stale_alerts_monitor = StaleAlertsMonitor(self)

        self.file_cache = FileCache(self.config)

        self.customServiceOrchestrator = CustomServiceOrchestrator(self)

        self.recovery_manager = RecoveryManager(self.config.recovery_cache_dir)
        self.commandStatuses = CommandStatusDict(self)
        self.action_queue = ActionQueue(self)
        self.alert_scheduler_handler = AlertSchedulerHandler(self)
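
All of the caches above are rooted in the same directory taken from the resolved configuration. A minimal sketch of that access pattern, assuming get_resolved_config() hands back one shared AmbariConfig instance (the import and attribute names come straight from these examples):

from ambari_agent.AmbariConfig import AmbariConfig

config = AmbariConfig.get_resolved_config()
print(config.cluster_cache_dir)   # backing directory for the cluster caches
print(config.recovery_cache_dir)  # backing directory for the RecoveryManager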
Example #2
    def __init__(self):
        self.stop_event = threading.Event()
        self.config = AmbariConfig.get_resolved_config()

        self.is_registered = None
        self.metadata_cache = None
        self.topology_cache = None
        self.host_level_params_cache = None
        self.configurations_cache = None
        self.alert_definitions_cache = None
        self.configuration_builder = None
        self.stale_alerts_monitor = None
        self.server_responses_listener = None
        self.file_cache = None
        self.customServiceOrchestrator = None
        self.recovery_manager = None
        self.commandStatuses = None
        self.action_queue = None
        self.alert_scheduler_handler = None

        self.init()
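
Example #2 pairs with Example #1: every collaborator is declared as None in __init__ and then populated in init(). A generic sketch of this two-phase pattern (illustrative names only, not Ambari code):

import threading

class TwoPhaseInit(object):
    def __init__(self):
        self.stop_event = threading.Event()
        self.resource = None  # declared up front so the attribute always exists
        self.init()

    def init(self):
        # builds (or rebuilds) the mutable collaborators; callable again
        # later without constructing a new object
        self.resource = object()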
Example #3
    def _load_metric(self, ams_collector_host, ams_metric, host_filter):
        get_metrics_parameters = {
            "metricNames": ams_metric,
            "appId": self.ams_app_id,
            "hostname": host_filter,
            "precision": "seconds",
            "grouped": "true",
        }
        encoded_get_metrics_parameters = urllib.urlencode(
            get_metrics_parameters)
        url = AMS_METRICS_GET_URL % encoded_get_metrics_parameters

        _ssl_version = AmbariConfig.get_resolved_config(
        ).get_force_https_protocol_value()

        ams_monitor_conf_dir = "/etc/ambari-metrics-monitor/conf"
        metric_truststore_ca_certs = 'ca.pem'
        ca_certs = os.path.join(ams_monitor_conf_dir,
                                metric_truststore_ca_certs)

        conn = None
        response = None
        data = None
        try:
            conn = network.get_http_connection(ams_collector_host,
                                               int(self.ams_collector_port),
                                               self.use_ssl,
                                               ca_certs,
                                               ssl_version=_ssl_version)
            conn.request("GET", url)
            response = conn.getresponse()
            data = response.read()
        except Exception, exception:
            if logger.isEnabledFor(logging.DEBUG):
                logger.exception(
                    "[Alert][{0}] Unable to retrieve metrics from AMS: {1}".
                    format(self.alert_id, str(exception)))
            status = response.status if response else None
            return None, status
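
The URL above is assembled by URL-encoding the parameter dict and substituting it into AMS_METRICS_GET_URL. A standalone sketch of that step (Python 2, as in the example; the metric values and URL shape are placeholders, since AMS_METRICS_GET_URL itself is not shown):

import urllib

get_metrics_parameters = {
    "metricNames": "cpu_user",             # placeholder metric name
    "appId": "HOST",                       # placeholder appId
    "hostname": "host1.example.com",       # placeholder host filter
    "precision": "seconds",
    "grouped": "true",
}
query = urllib.urlencode(get_metrics_parameters)
url = "/ws/v1/timeline/metrics?%s" % query  # assumed shape of AMS_METRICS_GET_URL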
Example #4
try:
  import hashlib
  _md5 = hashlib.md5
except ImportError:
  import md5
  _md5 = md5.new

logger = logging.getLogger(__name__)

# default connection timeout, in seconds
DEFAULT_CONNECTION_TIMEOUT = 5

WebResponse = namedtuple('WebResponse', 'status_code time_millis error_msg')

ensure_ssl_using_protocol(
    AmbariConfig.get_resolved_config().get_force_https_protocol_name(),
    AmbariConfig.get_resolved_config().get_ca_cert_file_path()
)

class WebAlert(BaseAlert):

  def __init__(self, alert_meta, alert_source_meta, config):
    super(WebAlert, self).__init__(alert_meta, alert_source_meta, config)

    connection_timeout = DEFAULT_CONNECTION_TIMEOUT

    # extract any lookup keys from the URI structure
    self.uri_property_keys = None
    if 'uri' in alert_source_meta:
      uri = alert_source_meta['uri']
      self.uri_property_keys = self._lookup_uri_property_keys(uri)
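
The _md5 alias at the top of this example gives old and new interpreters the same call shape: both hashlib.md5 and md5.new take a string (on Python 2) and expose hexdigest(). Example #5 relies on exactly that to derive a stable cache file name; a sketch with placeholder strings:

digest = _md5("{0}|{1}".format("some_principal", "some_keytab")).hexdigest()
print(digest)  # same 32-character hex digest on either import path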
Example #5
def curl_krb_request(tmp_dir,
                     keytab,
                     principal,
                     url,
                     cache_file_prefix,
                     krb_exec_search_paths,
                     return_only_http_code,
                     caller_label,
                     user,
                     connection_timeout=CONNECTION_TIMEOUT_DEFAULT,
                     ca_certs=None,
                     kinit_timer_ms=DEFAULT_KERBEROS_KINIT_TIMER_MS,
                     method='',
                     body='',
                     header=''):
    """
  Makes a curl request using the kerberos credentials stored in a calculated cache file. The
  cache file is created by combining the supplied principal, keytab, user, and request name into
  a unique hash.

  This function will use the klist command to determine if the cache is expired and will perform
  a kinit if necessary. Additionally, it has an internal timer to force a kinit after a
  configurable amount of time. This is to prevent boundary issues where requests hit the edge
  of a ticket's lifetime.

  :param tmp_dir: the directory to use for storing the local kerberos cache for this request.
  :param keytab: the location of the keytab to use when performing a kinit
  :param principal: the principal to use when performing a kinit
  :param url: the URL to request
  :param cache_file_prefix: an identifier used to build the unique cache name for this request.
                            This ensures that multiple requests can use the same cache.
  :param krb_exec_search_paths: the search path to use for invoking kerberos binaries
  :param return_only_http_code: True to return only the HTTP code, False to return GET content
  :param caller_label: an identifier to give context into the caller of this module (used for logging)
  :param user: the user to invoke the curl command as
  :param connection_timeout: if specified, a connection timeout for curl (default 10 seconds)
  :param ca_certs: path to certificates
  :param kinit_timer_ms: if specified, the time (in ms), before forcing a kinit even if the
                         klist cache is still valid.
  :return:
  """

    import uuid
    # backward compatibility with old code, management packs, etc.; all new code needs to pass ca_certs explicitly
    if ca_certs is None:
        try:
            from ambari_agent.AmbariConfig import AmbariConfig
            ca_certs = AmbariConfig.get_resolved_config(
            ).get_ca_cert_file_path()
        except:
            pass
    # start off false
    is_kinit_required = False

    # Create the kerberos credentials cache (ccache) file and set it in the environment to use
    # when executing curl. Use the md5 hash of the combination of the principal and keytab file
    # to generate a (relatively) unique cache filename so that we can use it as needed. Scope
    # this file by user in order to prevent sharing of cache files by multiple users.
    ccache_file_name = _md5("{0}|{1}".format(principal, keytab)).hexdigest()

    curl_krb_cache_path = os.path.join(tmp_dir, "curl_krb_cache")
    if not os.path.exists(curl_krb_cache_path):
        os.makedirs(curl_krb_cache_path)
    os.chmod(curl_krb_cache_path, 01777)

    ccache_file_path = "{0}{1}{2}_{3}_cc_{4}".format(curl_krb_cache_path,
                                                     os.sep, cache_file_prefix,
                                                     user, ccache_file_name)
    kerberos_env = {'KRB5CCNAME': ccache_file_path}

    # concurrent kinit's can cause the following error:
    # Internal credentials cache error while storing credentials while getting initial credentials
    kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
    kinit_lock.acquire()
    try:
        # If there are no tickets in the cache or they are expired, perform a kinit, else use what
        # is in the cache
        if krb_exec_search_paths:
            klist_path_local = get_klist_path(krb_exec_search_paths)
        else:
            klist_path_local = get_klist_path()

        # take a look at the last time kinit was run for the specified cache and force a new
        # kinit if it's time; this helps to avoid problems approaching ticket boundary when
        # executing a klist and then a curl
        last_kinit_time = _KINIT_CACHE_TIMES.get(ccache_file_name, 0)
        current_time = long(time.time())
        if current_time - kinit_timer_ms > last_kinit_time:
            is_kinit_required = True

        # if the time has not expired, double-check that the cache still has a valid ticket
        if not is_kinit_required:
            klist_command = "{0} -s {1}".format(klist_path_local,
                                                ccache_file_path)
            is_kinit_required = (shell.call(klist_command, user=user)[0] != 0)

        # if kinit is required, then perform the kinit
        if is_kinit_required:
            if krb_exec_search_paths:
                kinit_path_local = get_kinit_path(krb_exec_search_paths)
            else:
                kinit_path_local = get_kinit_path()

            logger.debug(
                "Enabling Kerberos authentication for %s via GSSAPI using ccache at %s",
                caller_label, ccache_file_path)

            # kinit; there's no need to set a ticket timeout as this will use the default invalidation
            # configured in the krb5.conf - regenerating keytabs will not prevent an existing cache
            # from working correctly
            shell.checked_call("{0} -c {1} -kt {2} {3} > /dev/null".format(
                kinit_path_local, ccache_file_path, keytab, principal),
                               user=user)

            # record kinit time
            _KINIT_CACHE_TIMES[ccache_file_name] = current_time
        else:
            # no kinit needed, use the cache
            logger.debug(
                "Kerberos authentication for %s via GSSAPI already enabled using ccache at %s.",
                caller_label, ccache_file_path)
    finally:
        kinit_lock.release()

    # check if cookies dir exists, if not then create it
    cookies_dir = os.path.join(tmp_dir, "cookies")

    if not os.path.exists(cookies_dir):
        os.makedirs(cookies_dir)

    cookie_file_name = str(uuid.uuid4())
    cookie_file = os.path.join(cookies_dir, cookie_file_name)

    start_time = time.time()
    error_msg = None

    # setup timeouts for the request; ensure we use integers since that is what curl needs
    connection_timeout = int(connection_timeout)
    maximum_timeout = connection_timeout + 2

    ssl_options = ['-k']
    if ca_certs:
        ssl_options = ['--cacert', ca_certs]
    try:
        if return_only_http_code:
            _, curl_stdout, curl_stderr = get_user_call_output(
                ['curl', '--location-trusted'] + ssl_options + [
                    '--negotiate', '-u', ':', '-b', cookie_file, '-c',
                    cookie_file, '-w', '%{http_code}', url,
                    '--connect-timeout',
                    str(connection_timeout), '--max-time',
                    str(maximum_timeout), '-o', '/dev/null'
                ],
                user=user,
                env=kerberos_env)
        else:
            curl_command = ['curl', '--location-trusted'] + ssl_options + [
                '--negotiate', '-u', ':', '-b', cookie_file, '-c', cookie_file,
                url, '--connect-timeout',
                str(connection_timeout), '--max-time',
                str(maximum_timeout)
            ]
            # returns response body
            if len(method) > 0 and len(body) == 0 and len(header) == 0:
                curl_command.extend(['-X', method])

            elif len(method) > 0 and len(body) == 0 and len(header) > 0:
                curl_command.extend(['-H', header, '-X', method])

            elif len(method) > 0 and len(body) > 0 and len(header) == 0:
                curl_command.extend(['-X', method, '-d', body])

            elif len(method) > 0 and len(body) > 0 and len(header) > 0:
                curl_command.extend(['-H', header, '-X', method, '-d', body])

            _, curl_stdout, curl_stderr = get_user_call_output(
                curl_command, user=user, env=kerberos_env)

    except Fail:
        if logger.isEnabledFor(logging.DEBUG):
            logger.exception(
                "Unable to make a curl request for {0}.".format(caller_label))
        raise
    finally:
        if os.path.isfile(cookie_file):
            os.remove(cookie_file)

    # an empty string evaluates to False
    if curl_stderr:
        error_msg = curl_stderr

    time_millis = time.time() - start_time

    # an empty string evaluates to False
    if curl_stdout:
        if return_only_http_code:
            return (int(curl_stdout), error_msg, time_millis)
        else:
            return (curl_stdout, error_msg, time_millis)

    logger.debug("The curl response for %s is empty; standard error = %s",
                 caller_label, str(error_msg))

    return ("", error_msg, time_millis)
Example #6
# preserving 2.4 compatibility.
try:
  import hashlib
  _md5 = hashlib.md5
except ImportError:
  import md5
  _md5 = md5.new

logger = logging.getLogger(__name__)

# default connection timeout, in seconds
DEFAULT_CONNECTION_TIMEOUT = 5

WebResponse = namedtuple('WebResponse', 'status_code time_millis error_msg')

ensure_ssl_using_protocol(AmbariConfig.get_resolved_config().get_force_https_protocol())

class WebAlert(BaseAlert):

  def __init__(self, alert_meta, alert_source_meta, config):
    super(WebAlert, self).__init__(alert_meta, alert_source_meta, config)

    connection_timeout = DEFAULT_CONNECTION_TIMEOUT

    # extract any lookup keys from the URI structure
    self.uri_property_keys = None
    if 'uri' in alert_source_meta:
      uri = alert_source_meta['uri']
      self.uri_property_keys = self._lookup_uri_property_keys(uri)

      if 'connection_timeout' in uri:
Example #7
import logging
import httplib
import sys
from ssl import SSLError
from HeartbeatHandlers import HeartbeatStopHandlers
from ambari_agent.AmbariConfig import AmbariConfig
from ambari_commons.inet_utils import ensure_ssl_using_protocol

ERROR_SSL_WRONG_VERSION = "SSLError: Failed to connect. Please check openssl library versions. \n" +\
              "Refer to: https://bugzilla.redhat.com/show_bug.cgi?id=1022468 for more details."
LOG_REQUEST_MESSAGE = "GET %s -> %s, body: %s"

logger = logging.getLogger(__name__)

ensure_ssl_using_protocol(
    AmbariConfig.get_resolved_config().get_force_https_protocol())


class NetUtil:

    DEFAULT_CONNECT_RETRY_DELAY_SEC = 10
    HEARTBEAT_IDLE_INTERVAL_DEFAULT_MIN_SEC = 1
    HEARTBEAT_IDLE_INTERVAL_DEFAULT_MAX_SEC = 10
    MINIMUM_INTERVAL_BETWEEN_HEARTBEATS = 0.1

    # URL within the server to request during a status check. This URL
    # should return HTTP code 200.
    SERVER_STATUS_REQUEST = "{0}/ca"
    # For testing purposes
    DEBUG_STOP_RETRIES_FLAG = False


def _get_ssl_version():
    return AmbariConfig.get_resolved_config().get_force_https_protocol_value()
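
The helper feeds the same ssl_version keyword that Example #3 passes to network.get_http_connection; a sketch reusing that call (host, port, and CA path are placeholders, and the network module is assumed to be imported as in Example #3):

conn = network.get_http_connection(
    "metrics.example.com",                      # placeholder collector host
    6188,                                       # placeholder collector port
    True,                                       # use_ssl
    "/etc/ambari-metrics-monitor/conf/ca.pem",  # CA bundle, as in Example #3
    ssl_version=_get_ssl_version())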
Example #9
def execute(configurations={}, parameters={}, host_name=None):
    """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations : a mapping of configuration key to value
  parameters : a mapping of script parameter key to value
  host_name : the name of this host where the alert is running

  :type configurations dict
  :type parameters dict
  :type host_name str
  """
    hostnames = host_name
    current_time = int(time.time()) * 1000

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    merge_ha_metrics = MERGE_HA_METRICS_PARAM_DEFAULT
    if MERGE_HA_METRICS_PARAM_KEY in parameters:
        merge_ha_metrics = parameters[MERGE_HA_METRICS_PARAM_KEY].lower(
        ) == 'true'

    metric_name = METRIC_NAME_PARAM_DEFAULT
    if METRIC_NAME_PARAM_KEY in parameters:
        metric_name = parameters[METRIC_NAME_PARAM_KEY]

    metric_units = METRIC_UNITS_DEFAULT
    if METRIC_UNITS_PARAM_KEY in parameters:
        metric_units = parameters[METRIC_UNITS_PARAM_KEY]

    app_id = APP_ID_PARAM_DEFAULT
    if APP_ID_PARAM_KEY in parameters:
        app_id = parameters[APP_ID_PARAM_KEY]

    interval = INTERVAL_PARAM_DEFAULT
    if INTERVAL_PARAM_KEY in parameters:
        interval = _coerce_to_integer(parameters[INTERVAL_PARAM_KEY])

    warning_threshold = DEVIATION_WARNING_THRESHOLD_DEFAULT
    if DEVIATION_WARNING_THRESHOLD_KEY in parameters:
        warning_threshold = _coerce_to_integer(
            parameters[DEVIATION_WARNING_THRESHOLD_KEY])

    critical_threshold = DEVIATION_CRITICAL_THRESHOLD_DEFAULT
    if DEVIATION_CRITICAL_THRESHOLD_KEY in parameters:
        critical_threshold = _coerce_to_integer(
            parameters[DEVIATION_CRITICAL_THRESHOLD_KEY])

    minimum_value_threshold = None
    if MINIMUM_VALUE_THRESHOLD_KEY in parameters:
        minimum_value_threshold = _coerce_to_integer(
            parameters[MINIMUM_VALUE_THRESHOLD_KEY])

    # parse configuration
    if configurations is None:
        return (RESULT_STATE_UNKNOWN,
                ['There were no configurations supplied to the script.'])

    # hdfs-site is required
    if HDFS_SITE_KEY not in configurations:
        return (RESULT_STATE_UNKNOWN, [
            '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        ])

    if METRICS_COLLECTOR_VIP_HOST_KEY in configurations and METRICS_COLLECTOR_VIP_PORT_KEY in configurations:
        collector_host = configurations[METRICS_COLLECTOR_VIP_HOST_KEY]
        collector_port = int(configurations[METRICS_COLLECTOR_VIP_PORT_KEY])
    else:
        # ams-site/timeline.metrics.service.webapp.address is required
        if METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY not in configurations:
            return (RESULT_STATE_UNKNOWN, [
                '{0} is a required parameter for the script'.format(
                    METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)
            ])
        else:
            collector_webapp_address = configurations[
                METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY].split(":")
            if valid_collector_webapp_address(collector_webapp_address):
                collector_host = select_metric_collector_for_sink(
                    app_id.lower())
                collector_port = int(collector_webapp_address[1])
            else:
                return (RESULT_STATE_UNKNOWN, [
                    '{0} value should be set as "fqdn_hostname:port", but set to {1}'
                    .format(
                        METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY,
                        configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])
                ])

    namenode_service_rpc_address = None
    # hdfs-site is required
    if HDFS_SITE_KEY not in configurations:
        return (RESULT_STATE_UNKNOWN, [
            '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        ])

    hdfs_site = configurations[HDFS_SITE_KEY]

    if 'dfs.namenode.servicerpc-address' in hdfs_site:
        namenode_service_rpc_address = hdfs_site[
            'dfs.namenode.servicerpc-address']

    # if namenode alert and HA mode
    if NAMESERVICE_KEY in configurations and app_id.lower() == 'namenode':
        # hdfs-site is required
        if HDFS_SITE_KEY not in configurations:
            return (RESULT_STATE_UNKNOWN, [
                '{0} is a required parameter for the script'.format(
                    HDFS_SITE_KEY)
            ])

        if SMOKEUSER_KEY in configurations:
            smokeuser = configurations[SMOKEUSER_KEY]

        executable_paths = None
        if EXECUTABLE_SEARCH_PATHS in configurations:
            executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

        # parse script arguments
        security_enabled = False
        if SECURITY_ENABLED_KEY in configurations:
            security_enabled = str(
                configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

        kerberos_keytab = None
        if KERBEROS_KEYTAB in configurations:
            kerberos_keytab = configurations[KERBEROS_KEYTAB]

        kerberos_principal = None
        if KERBEROS_PRINCIPAL in configurations:
            kerberos_principal = configurations[KERBEROS_PRINCIPAL]
            kerberos_principal = kerberos_principal.replace('_HOST', host_name)

        # determine whether or not SSL is enabled
        is_ssl_enabled = False
        if DFS_POLICY_KEY in configurations:
            dfs_policy = configurations[DFS_POLICY_KEY]
            if dfs_policy == "HTTPS_ONLY":
                is_ssl_enabled = True

        kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER,
                                        DEFAULT_KERBEROS_KINIT_TIMER_MS)

        name_service = configurations[NAMESERVICE_KEY]

        # look for dfs.ha.namenodes.foo
        nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
        if nn_unique_ids_key not in hdfs_site:
            return (RESULT_STATE_UNKNOWN, [
                'Unable to find unique NameNode alias key {0}'.format(
                    nn_unique_ids_key)
            ])

        namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
        jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

        if is_ssl_enabled:
            namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
            jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

        # now we have something like 'nn1,nn2,nn3,nn4'
        # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
        # e.g. dfs.namenode.http-address.hacluster.nn1
        namenodes = []
        active_namenodes = []
        nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
        for nn_unique_id in nn_unique_ids:
            key = namenode_http_fragment.format(name_service, nn_unique_id)

            if key in hdfs_site:
                # use str() to ensure that unicode strings do not carry the u'' prefix
                value = str(hdfs_site[key])
                namenode = value.split(":")[0]

                namenodes.append(namenode)
                try:
                    jmx_uri = jmx_uri_fragment.format(value)
                    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
                        env = Environment.get_instance()

                        # curl requires an integer timeout
                        curl_connection_timeout = int(connection_timeout)
                        state_response, error_msg, time_millis = curl_krb_request(
                            env.tmp_dir,
                            kerberos_keytab,
                            kerberos_principal,
                            jmx_uri,
                            "ha_nn_health",
                            executable_paths,
                            False,
                            "NameNode High Availability Health",
                            smokeuser,
                            connection_timeout=curl_connection_timeout,
                            kinit_timer_ms=kinit_timer_ms)

                        state = _get_ha_state_from_json(state_response)
                    else:
                        state_response = get_jmx(jmx_uri, connection_timeout)
                        state = _get_ha_state_from_json(state_response)

                    if state == HDFS_NN_STATE_ACTIVE:
                        active_namenodes.append(namenode)

                        # Only check active NN
                        nn_service_rpc_address_key = 'dfs.namenode.servicerpc-address.{0}.{1}'.format(
                            name_service, nn_unique_id)
                        if nn_service_rpc_address_key in hdfs_site:
                            namenode_service_rpc_address = hdfs_site[
                                nn_service_rpc_address_key]
                    pass
                except:
                    logger.exception("Unable to determine the active NameNode")
        pass

        if merge_ha_metrics:
            hostnames = ",".join(namenodes)
            # run only on active NN, no need to run the same requests from the standby
            if host_name not in active_namenodes:
                return (RESULT_STATE_SKIPPED,
                        ['This alert will be reported by another host.'])
        pass

    # Skip service rpc alert if port is not enabled
    if not namenode_service_rpc_address and 'rpc.rpc.datanode' in metric_name:
        return (RESULT_STATE_SKIPPED, ['Service RPC port is not enabled.'])

    get_metrics_parameters = {
        "metricNames": metric_name,
        "appId": app_id,
        "hostname": hostnames,
        "startTime": current_time - interval * 60 * 1000,
        "endTime": current_time,
        "grouped": "true",
    }

    encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)

    ams_monitor_conf_dir = "/etc/ambari-metrics-monitor/conf"
    metric_truststore_ca_certs = 'ca.pem'
    ca_certs = os.path.join(ams_monitor_conf_dir, metric_truststore_ca_certs)
    metric_collector_https_enabled = str(
        configurations[AMS_HTTP_POLICY]) == "HTTPS_ONLY"

    try:
        conn = network.get_http_connection(
            collector_host,
            int(collector_port),
            metric_collector_https_enabled,
            ca_certs,
            ssl_version=AmbariConfig.get_resolved_config(
            ).get_force_https_protocol_value())
        conn.request("GET",
                     AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
        response = conn.getresponse()
        data = response.read()
        conn.close()
    except Exception, e:
        logger.info(str(e))
        return (RESULT_STATE_UNKNOWN, [
            "Unable to retrieve metrics from the Ambari Metrics service."
        ])
Example #10
def execute(configurations={}, parameters={}, host_name=None):
    """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations : a mapping of configuration key to value
  parameters : a mapping of script parameter key to value
  host_name : the name of this host where the alert is running

  :type configurations dict
  :type parameters dict
  :type host_name str
  """
    hostnames = host_name
    current_time = int(time.time()) * 1000

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    merge_ha_metrics = MERGE_HA_METRICS_PARAM_DEFAULT
    if MERGE_HA_METRICS_PARAM_KEY in parameters:
        merge_ha_metrics = parameters[MERGE_HA_METRICS_PARAM_KEY].lower(
        ) == 'true'

    metric_name = METRIC_NAME_PARAM_DEFAULT
    if METRIC_NAME_PARAM_KEY in parameters:
        metric_name = parameters[METRIC_NAME_PARAM_KEY]

    metric_units = METRIC_UNITS_DEFAULT
    if METRIC_UNITS_PARAM_KEY in parameters:
        metric_units = parameters[METRIC_UNITS_PARAM_KEY]

    app_id = APP_ID_PARAM_DEFAULT
    if APP_ID_PARAM_KEY in parameters:
        app_id = parameters[APP_ID_PARAM_KEY]

    interval = INTERVAL_PARAM_DEFAULT
    if INTERVAL_PARAM_KEY in parameters:
        interval = _coerce_to_integer(parameters[INTERVAL_PARAM_KEY])

    warning_threshold = DEVIATION_WARNING_THRESHOLD_DEFAULT
    if DEVIATION_WARNING_THRESHOLD_KEY in parameters:
        warning_threshold = _coerce_to_integer(
            parameters[DEVIATION_WARNING_THRESHOLD_KEY])

    critical_threshold = DEVIATION_CRITICAL_THRESHOLD_DEFAULT
    if DEVIATION_CRITICAL_THRESHOLD_KEY in parameters:
        critical_threshold = _coerce_to_integer(
            parameters[DEVIATION_CRITICAL_THRESHOLD_KEY])

    minimum_value_threshold = None
    if MINIMUM_VALUE_THRESHOLD_KEY in parameters:
        minimum_value_threshold = _coerce_to_integer(
            parameters[MINIMUM_VALUE_THRESHOLD_KEY])

    # parse configuration
    if configurations is None:
        return (RESULT_STATE_UNKNOWN,
                ['There were no configurations supplied to the script.'])

    # hdfs-site is required
    if HDFS_SITE_KEY not in configurations:
        return (RESULT_STATE_UNKNOWN, [
            '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        ])

    if METRICS_COLLECTOR_VIP_HOST_KEY in configurations and METRICS_COLLECTOR_VIP_PORT_KEY in configurations:
        collector_host = configurations[METRICS_COLLECTOR_VIP_HOST_KEY]
        collector_port = int(configurations[METRICS_COLLECTOR_VIP_PORT_KEY])
    else:
        # ams-site/timeline.metrics.service.webapp.address is required
        if METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY not in configurations:
            return (RESULT_STATE_UNKNOWN, [
                '{0} is a required parameter for the script'.format(
                    METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)
            ])
        else:
            collector_webapp_address = configurations[
                METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY].split(":")
            if valid_collector_webapp_address(collector_webapp_address):
                collector_host = select_metric_collector_for_sink(
                    app_id.lower())
                collector_port = int(collector_webapp_address[1])
            else:
                return (RESULT_STATE_UNKNOWN, [
                    '{0} value should be set as "fqdn_hostname:port", but set to {1}'
                    .format(
                        METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY,
                        configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])
                ])

    namenode_service_rpc_address = None
    # hdfs-site is required
    if HDFS_SITE_KEY not in configurations:
        return (RESULT_STATE_UNKNOWN, [
            '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        ])

    hdfs_site = configurations[HDFS_SITE_KEY]

    if 'dfs.namenode.servicerpc-address' in hdfs_site:
        namenode_service_rpc_address = hdfs_site[
            'dfs.namenode.servicerpc-address']

    # if namenode alert and HA mode
    if NAMESERVICE_KEY in configurations and app_id.lower() == 'namenode':
        # hdfs-site is required
        if HDFS_SITE_KEY not in configurations:
            return (RESULT_STATE_UNKNOWN, [
                '{0} is a required parameter for the script'.format(
                    HDFS_SITE_KEY)
            ])

        if SMOKEUSER_KEY in configurations:
            smokeuser = configurations[SMOKEUSER_KEY]

        executable_paths = None
        if EXECUTABLE_SEARCH_PATHS in configurations:
            executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

        # parse script arguments
        security_enabled = False
        if SECURITY_ENABLED_KEY in configurations:
            security_enabled = str(
                configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

        kerberos_keytab = None
        if KERBEROS_KEYTAB in configurations:
            kerberos_keytab = configurations[KERBEROS_KEYTAB]

        kerberos_principal = None
        if KERBEROS_PRINCIPAL in configurations:
            kerberos_principal = configurations[KERBEROS_PRINCIPAL]
            kerberos_principal = kerberos_principal.replace('_HOST', host_name)

        # determine whether or not SSL is enabled
        is_ssl_enabled = False
        if DFS_POLICY_KEY in configurations:
            dfs_policy = configurations[DFS_POLICY_KEY]
            if dfs_policy == "HTTPS_ONLY":
                is_ssl_enabled = True

        kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER,
                                        DEFAULT_KERBEROS_KINIT_TIMER_MS)

        name_service = configurations[NAMESERVICE_KEY]

        # look for dfs.ha.namenodes.foo
        nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
        if nn_unique_ids_key not in hdfs_site:
            return (RESULT_STATE_UNKNOWN, [
                'Unable to find unique NameNode alias key {0}'.format(
                    nn_unique_ids_key)
            ])

        namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
        jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

        if is_ssl_enabled:
            namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
            jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

        # now we have something like 'nn1,nn2,nn3,nn4'
        # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
        # e.g. dfs.namenode.http-address.hacluster.nn1
        namenodes = []
        active_namenodes = []
        nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
        for nn_unique_id in nn_unique_ids:
            key = namenode_http_fragment.format(name_service, nn_unique_id)

            if key in hdfs_site:
                # use str() to ensure that unicode strings do not carry the u'' prefix
                value = str(hdfs_site[key])
                namenode = value.split(":")[0]

                namenodes.append(namenode)
                try:
                    jmx_uri = jmx_uri_fragment.format(value)
                    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
                        env = Environment.get_instance()

                        # curl requires an integer timeout
                        curl_connection_timeout = int(connection_timeout)
                        state_response, error_msg, time_millis = curl_krb_request(
                            env.tmp_dir,
                            kerberos_keytab,
                            kerberos_principal,
                            jmx_uri,
                            "ha_nn_health",
                            executable_paths,
                            False,
                            "NameNode High Availability Health",
                            smokeuser,
                            connection_timeout=curl_connection_timeout,
                            kinit_timer_ms=kinit_timer_ms)

                        state = _get_ha_state_from_json(state_response)
                    else:
                        state_response = get_jmx(jmx_uri, connection_timeout)
                        state = _get_ha_state_from_json(state_response)

                    if state == HDFS_NN_STATE_ACTIVE:
                        active_namenodes.append(namenode)

                        # Only check active NN
                        nn_service_rpc_address_key = 'dfs.namenode.servicerpc-address.{0}.{1}'.format(
                            name_service, nn_unique_id)
                        if nn_service_rpc_address_key in hdfs_site:
                            namenode_service_rpc_address = hdfs_site[
                                nn_service_rpc_address_key]
                    pass
                except:
                    logger.exception("Unable to determine the active NameNode")
        pass

        if merge_ha_metrics:
            hostnames = ",".join(namenodes)
            # run only on active NN, no need to run the same requests from the standby
            if host_name not in active_namenodes:
                return (RESULT_STATE_SKIPPED,
                        ['This alert will be reported by another host.'])
        pass

    # Skip service rpc alert if port is not enabled
    if not namenode_service_rpc_address and 'rpc.rpc.datanode' in metric_name:
        return (RESULT_STATE_SKIPPED, ['Service RPC port is not enabled.'])

    get_metrics_parameters = {
        "metricNames": metric_name,
        "appId": app_id,
        "hostname": hostnames,
        "startTime": current_time - interval * 60 * 1000,
        "endTime": current_time,
        "grouped": "true",
    }

    encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)

    ams_monitor_conf_dir = "/etc/ambari-metrics-monitor/conf"
    metric_truststore_ca_certs = 'ca.pem'
    ca_certs = os.path.join(ams_monitor_conf_dir, metric_truststore_ca_certs)
    metric_collector_https_enabled = str(
        configurations[AMS_HTTP_POLICY]) == "HTTPS_ONLY"

    try:
        conn = network.get_http_connection(
            collector_host,
            int(collector_port),
            metric_collector_https_enabled,
            ca_certs,
            ssl_version=AmbariConfig.get_resolved_config(
            ).get_force_https_protocol_value())
        conn.request("GET",
                     AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
        response = conn.getresponse()
        data = response.read()
        conn.close()
    except Exception:
        return (RESULT_STATE_UNKNOWN, [
            "Unable to retrieve metrics from the Ambari Metrics service."
        ])

    if response.status != 200:
        return (RESULT_STATE_UNKNOWN, [
            "Unable to retrieve metrics from the Ambari Metrics service."
        ])

    data_json = json.loads(data)
    metrics = []
    # the standard deviation across multiple hosts will be large if, for
    # example, host1 reports small local values while host2 reports large ones
    for metrics_data in data_json["metrics"]:
        metrics += metrics_data["metrics"].values()
    pass

    if not metrics or len(metrics) < 2:
        number_of_data_points = len(metrics) if metrics else 0
        return (RESULT_STATE_SKIPPED, [
            "There are not enough data points to calculate the standard deviation ({0} sampled)"
            .format(number_of_data_points)
        ])

    minimum_value_multiplier = 1
    if 'dfs.FSNamesystem.CapacityUsed' in metric_name:
        minimum_value_multiplier = 1024 * 1024  # MB to bytes
    elif 'rpc.rpc.datanode' in metric_name or 'rpc.rpc.client' in metric_name:
        minimum_value_multiplier = 1000  # seconds to millis

    if minimum_value_threshold:
        # Filter out points below min threshold
        metrics = [
            metric for metric in metrics
            if metric > (minimum_value_threshold * minimum_value_multiplier)
        ]
        if len(metrics) < 2:
            return (RESULT_STATE_OK, [
                'There were no data points above the minimum threshold of {0} seconds'
                .format(minimum_value_threshold)
            ])

    mean_value = mean(metrics)
    stddev = sample_standard_deviation(metrics)

    try:
        deviation_percent = stddev / float(mean_value) * 100
    except ZeroDivisionError:
        # should not be a case for this alert
        return (RESULT_STATE_SKIPPED, [
            "Unable to calculate the standard deviation because the mean value is 0"
        ])

    # log the AMS request
    if logger.isEnabledFor(logging.DEBUG):
        logger.debug("""
    AMS request parameters - {0}
    AMS response - {1}
    Mean - {2}
    Standard deviation - {3}
    Percentage standard deviation - {4}
    """.format(encoded_get_metrics_parameters, data_json, mean_value, stddev,
               deviation_percent))

    mean_value_localized = locale.format("%.0f", mean_value, grouping=True)

    variance_value = (deviation_percent / 100.0) * mean_value
    variance_value_localized = locale.format("%.0f",
                                             variance_value,
                                             grouping=True)

    # check for CRITICAL status
    if deviation_percent > critical_threshold:
        threshold_value = ((critical_threshold / 100.0) * mean_value)
        threshold_value_localized = locale.format("%.0f",
                                                  threshold_value,
                                                  grouping=True)

        message = DEVIATION_THRESHOLD_MESSAGE.format(
            variance_value_localized, metric_units, deviation_percent,
            mean_value_localized, metric_units, threshold_value_localized,
            metric_units)

        return (RESULT_STATE_CRITICAL, [message])

    # check for WARNING status
    if deviation_percent > warning_threshold:
        threshold_value = ((warning_threshold / 100.0) * mean_value)
        threshold_value_localized = locale.format("%.0f",
                                                  threshold_value,
                                                  grouping=True)

        message = DEVIATION_THRESHOLD_MESSAGE.format(
            variance_value_localized, metric_units, deviation_percent,
            mean_value_localized, metric_units, threshold_value_localized,
            metric_units)

        return (RESULT_STATE_WARNING, [message])

    # return OK status; use the warning threshold as the value to compare against
    threshold_value = ((warning_threshold / 100.0) * mean_value)
    threshold_value_localized = locale.format("%.0f",
                                              threshold_value,
                                              grouping=True)

    message = DEVIATION_OK_MESSAGE.format(variance_value_localized,
                                          metric_units, warning_threshold,
                                          mean_value_localized, metric_units,
                                          threshold_value_localized,
                                          metric_units)

    return (RESULT_STATE_OK, [message])
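
To make the deviation math above concrete, here is a standalone rerun with made-up samples, assuming sample_standard_deviation is the usual Bessel-corrected (n-1) formula its name suggests:

import math

metrics = [100.0, 110.0, 90.0, 105.0]                  # made-up data points
mean_value = sum(metrics) / len(metrics)               # 101.25
variance = sum((m - mean_value) ** 2 for m in metrics) / (len(metrics) - 1)
stddev = math.sqrt(variance)                           # ~8.54
deviation_percent = stddev / float(mean_value) * 100   # ~8.43%
# deviation_percent is then compared against warning_threshold and
# critical_threshold, which are expressed as percentages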
Example #11
    def __init__(self):
        self.stop_event = threading.Event()
        self.config = AmbariConfig.get_resolved_config()
        self.init()