def perform_grafana_put_call(url, id, payload, server):
  """
  Send an HTTP PUT request to Grafana and return the response with its body.

  The request targets ``url + "/" + str(id)`` and carries a JSON content type
  plus a Basic Authorization header built from server.user and
  server.password. Attempts that fail with httplib.HTTPException or
  socket.error are repeated, up to GRAFANA_CONNECT_TRIES attempts in total,
  with a pause of GRAFANA_CONNECT_TIMEOUT seconds between them.

  :param url: request path that the id is appended to
  :param id: identifier appended to the path (converted with str())
  :param payload: request body
  :param server: object providing host, port, user, password and protocol
  :return: tuple (response, data); data is the body read from the response
  :raises Fail: if the final attempt also fails with a connection error
  """
  response = None
  data = None
  userAndPass = b64encode('{0}:{1}'.format(server.user, server.password))
  headers = {"Content-Type": "application/json",
             'Authorization' : 'Basic %s' %  userAndPass }
  grafana_https_enabled = server.protocol.lower() == 'https'

  ca_certs = None
  if grafana_https_enabled:
    # params is only needed for the certificate file, so import it lazily
    import params
    ca_certs = params.ams_grafana_cert_file

  for i in xrange(0, GRAFANA_CONNECT_TRIES):
    conn = None
    failure = None
    try:
      conn = network.get_http_connection(server.host, int(server.port), grafana_https_enabled, ca_certs)
      conn.request("PUT", url + "/" + str(id), payload, headers)
      response = conn.getresponse()
      data = response.read()
      Logger.info("Http data: %s" % data)
    except (httplib.HTTPException, socket.error) as ex:
      failure = ex
    finally:
      # release the connection on success and on failure alike, so a failed
      # attempt does not leave its socket open while waiting for the retry
      if conn is not None:
        conn.close()

    if failure is None:
      break

    if i >= GRAFANA_CONNECT_TRIES - 1:
      raise Fail("Ambari Metrics Grafana update failed due to: %s" % str(failure))

    # announce the retry before sleeping so the log reflects what is happening
    Logger.info("Connection to Grafana failed. Next retry in %s seconds."
                % (GRAFANA_CONNECT_TIMEOUT))
    time.sleep(GRAFANA_CONNECT_TIMEOUT)

  return (response, data)
def perform_grafana_get_call(url, server):
  """
  Send an HTTP GET request to Grafana and return the (unread) response.

  The request carries a Basic Authorization header built from server.user and
  server.password. Attempts that fail with httplib.HTTPException or
  socket.error are repeated, up to GRAFANA_CONNECT_TRIES attempts in total,
  with a pause of GRAFANA_CONNECT_TIMEOUT seconds between them.

  :param url: request path
  :param server: object providing host, port, user, password and protocol
  :return: the response object; its body has not been read by this function
  :raises Fail: if the final attempt also fails with a connection error
  """
  grafana_https_enabled = server.protocol.lower() == 'https'
  response = None
  ca_certs = None
  if grafana_https_enabled:
    # params is only needed for the certificate file, so import it lazily
    import params
    ca_certs = params.ams_grafana_cert_file

  # the credentials do not change between attempts, build the header once
  userAndPass = b64encode('{0}:{1}'.format(server.user, server.password))
  headers = { 'Authorization' : 'Basic %s' %  userAndPass }

  for i in xrange(0, GRAFANA_CONNECT_TRIES):
    conn = None
    try:
      conn = network.get_http_connection(server.host,
                                         int(server.port),
                                         grafana_https_enabled, ca_certs)

      Logger.info("Connecting (GET) to %s:%s%s" % (server.host, server.port, url))

      conn.request("GET", url, headers = headers)
      response = conn.getresponse()
      Logger.info("Http response: %s %s" % (response.status, response.reason))
      # the connection is deliberately left open on success: the body of the
      # returned response has not been read yet
      break
    except (httplib.HTTPException, socket.error) as ex:
      # a failed attempt will not be used again, release its connection
      if conn is not None:
        conn.close()

      if i >= GRAFANA_CONNECT_TRIES - 1:
        raise Fail("Ambari Metrics Grafana update failed due to: %s" % str(ex))

      # announce the retry before sleeping so the log reflects what is happening
      Logger.info("Connection to Grafana failed. Next retry in %s seconds."
                  % (GRAFANA_CONNECT_TIMEOUT))
      time.sleep(GRAFANA_CONNECT_TIMEOUT)

  return response
def perform_grafana_post_call(url, payload, server):
    """
    Send an HTTP POST request to Grafana and return the response with its body.

    The request carries a JSON content type, an explicit Content-Length and a
    Basic Authorization header built from server.user and server.password.
    An attempt is repeated when it fails with httplib.HTTPException or
    socket.error, or when Grafana answers 401, up to
    params.grafana_connect_attempts attempts in total with a pause of
    params.grafana_connect_retry_delay seconds between them. A 401 on the
    final attempt is returned to the caller like any other response.

    :param url: request path
    :param payload: request body
    :param server: object providing host, port, user, password and protocol
    :return: tuple (response, data); data is the body read from the response
    :raises Fail: if the final attempt fails with a connection error
    """
    import params

    response = None
    data = None
    userAndPass = b64encode('{0}:{1}'.format(server.user, server.password))
    Logger.debug('POST payload: %s' % payload)
    headers = {
        "Content-Type": "application/json",
        "Content-Length": len(payload),
        'Authorization': 'Basic %s' % userAndPass
    }
    grafana_https_enabled = server.protocol.lower() == 'https'

    ca_certs = None
    if grafana_https_enabled:
        ca_certs = params.ams_grafana_ca_cert

    for i in xrange(0, params.grafana_connect_attempts):
        is_last_attempt = i >= params.grafana_connect_attempts - 1
        conn = None
        try:
            Logger.info("Connecting (POST) to %s:%s%s" %
                        (server.host, server.port, url))
            conn = network.get_http_connection(
                server.host,
                int(server.port),
                grafana_https_enabled,
                ca_certs,
                ssl_version=Script.get_force_https_protocol_value())

            conn.request("POST", url, payload, headers)

            response = conn.getresponse()
            Logger.info("Http response: %s %s" %
                        (response.status, response.reason))

            # Intermittent error thrown from Grafana: retry while attempts remain
            if response.status == 401 and not is_last_attempt:
                Logger.info(
                    "Connection to Grafana failed. Next retry in %s seconds."
                    % (params.grafana_connect_retry_delay))
                # the rejected attempt is abandoned, so release its connection
                conn.close()
                time.sleep(params.grafana_connect_retry_delay)
                continue

            data = response.read()
            Logger.info("Http data: %s" % data)
            conn.close()
            break
        except (httplib.HTTPException, socket.error) as ex:
            # a failed attempt will not be used again, release its connection
            if conn is not None:
                conn.close()

            if is_last_attempt:
                raise Fail("Ambari Metrics Grafana update failed due to: %s" %
                           str(ex))

            Logger.info(
                "Connection to Grafana failed. Next retry in %s seconds." %
                (params.grafana_connect_retry_delay))
            time.sleep(params.grafana_connect_retry_delay)

    return (response, data)
Пример #4
0
def post_metrics_to_collector(ams_metrics_post_url,
                              metric_collector_host,
                              metric_collector_port,
                              metric_collector_https_enabled,
                              metric_json,
                              headers,
                              ca_certs,
                              tries=1,
                              connect_timeout=10):
    """
    POST a metrics document to the metrics collector, retrying on failure.

    An attempt counts as failed when the connection raises
    httplib.HTTPException or socket.error, or when the collector answers with a
    status other than 200. Failed attempts are repeated until `tries` attempts
    have been made, waiting `connect_timeout` seconds between attempts.

    :param ams_metrics_post_url: request path on the collector
    :param metric_collector_host: collector host name
    :param metric_collector_port: collector port (converted with int())
    :param metric_collector_https_enabled: passed on to the connection factory
    :param metric_json: request body
    :param headers: HTTP headers sent with the request
    :param ca_certs: CA certificate path passed on to the connection factory
    :param tries: maximum number of attempts
    :param connect_timeout: pause in seconds between two attempts
    :raises Fail: if the final attempt fails to connect or is not answered
                  with status 200
    """
    for i in xrange(0, tries):
        # range/xrange returns items from start to end-1
        is_last_attempt = i >= tries - 1
        conn = None
        try:
            Logger.info("Generated metrics for host %s :\n%s" %
                        (metric_collector_host, metric_json))

            Logger.info("Connecting (POST) to %s:%s%s" %
                        (metric_collector_host, metric_collector_port,
                         ams_metrics_post_url))
            conn = network.get_http_connection(
                metric_collector_host,
                int(metric_collector_port),
                metric_collector_https_enabled,
                ca_certs,
                ssl_version=Script.get_force_https_protocol_value())
            conn.request("POST", ams_metrics_post_url, metric_json, headers)

            response = conn.getresponse()
            Logger.info(
                "Http response for host %s: %s %s" %
                (metric_collector_host, response.status, response.reason))
        except (httplib.HTTPException, socket.error):
            # a failed attempt will not be used again, release its connection
            if conn is not None:
                conn.close()

            if is_last_attempt:
                raise Fail("Metrics were not saved. Connection failed.")

            Logger.info(
                "Connection failed for host %s. Next retry in %s seconds."
                % (metric_collector_host, connect_timeout))
            time.sleep(connect_timeout)
            continue

        # reading stays outside the retry handler, as before: a failure while
        # reading the body is propagated to the caller unchanged
        try:
            data = response.read()
        finally:
            conn.close()
        Logger.info("Http data: %s" % data)

        if response.status == 200:
            Logger.info("Metrics were saved.")
            break

        Logger.info("Metrics were not saved.")
        if is_last_attempt:
            raise Fail(
                "Metrics were not saved. POST request status: %s %s \n%s" %
                (response.status, response.reason, data))

        # wait for the configured delay; the attempt count is not a duration
        Logger.info("Next retry in %s seconds." % (connect_timeout))
        time.sleep(connect_timeout)
Пример #5
0
def perform_grafana_post_call(url, payload, server):
    """
    Send an HTTP POST request to Grafana and return the response with its body.

    The request carries a JSON content type, an explicit Content-Length and a
    Basic Authorization header built from server.user and server.password.
    An attempt is repeated when it fails with httplib.HTTPException or
    socket.error, or when Grafana answers 401, up to GRAFANA_CONNECT_TRIES
    attempts in total with a pause of GRAFANA_CONNECT_TIMEOUT seconds between
    them. A 401 on the final attempt is returned to the caller like any other
    response. No CA certificate is passed to the connection factory here.

    :param url: request path
    :param payload: request body
    :param server: object providing host, port, user, password and protocol
    :return: tuple (response, data); data is the body read from the response
    :raises Fail: if the final attempt fails with a connection error
    """
    response = None
    data = None
    userAndPass = b64encode('{0}:{1}'.format(server.user, server.password))
    Logger.debug('POST payload: %s' % payload)
    headers = {
        "Content-Type": "application/json",
        "Content-Length": len(payload),
        'Authorization': 'Basic %s' % userAndPass
    }
    grafana_https_enabled = server.protocol.lower() == 'https'

    for i in xrange(0, GRAFANA_CONNECT_TRIES):
        is_last_attempt = i >= GRAFANA_CONNECT_TRIES - 1
        conn = None
        try:
            Logger.info("Connecting (POST) to %s:%s%s" %
                        (server.host, server.port, url))
            conn = network.get_http_connection(server.host, int(server.port),
                                               grafana_https_enabled)

            conn.request("POST", url, payload, headers)

            response = conn.getresponse()
            Logger.info("Http response: %s %s" %
                        (response.status, response.reason))

            # Intermittent error thrown from Grafana: retry while attempts remain
            if response.status == 401 and not is_last_attempt:
                Logger.info(
                    "Connection to Grafana failed. Next retry in %s seconds."
                    % (GRAFANA_CONNECT_TIMEOUT))
                # the rejected attempt is abandoned, so release its connection
                conn.close()
                time.sleep(GRAFANA_CONNECT_TIMEOUT)
                continue

            data = response.read()
            Logger.info("Http data: %s" % data)
            conn.close()
            break
        except (httplib.HTTPException, socket.error) as ex:
            # a failed attempt will not be used again, release its connection
            if conn is not None:
                conn.close()

            if is_last_attempt:
                raise Fail("Ambari Metrics Grafana update failed due to: %s" %
                           str(ex))

            # announce the retry before sleeping so the log reflects what is
            # happening
            Logger.info(
                "Connection to Grafana failed. Next retry in %s seconds." %
                (GRAFANA_CONNECT_TIMEOUT))
            time.sleep(GRAFANA_CONNECT_TIMEOUT)

    return (response, data)
def perform_grafana_delete_call(url, server):
    """
    Send an HTTP DELETE request to Grafana and return the (unread) response.

    The request carries a Basic Authorization header built from server.user and
    server.password. Attempts that fail with httplib.HTTPException or
    socket.error are repeated, up to params.grafana_connect_attempts attempts
    in total, with a pause of params.grafana_connect_retry_delay seconds
    between them.

    :param url: request path
    :param server: object providing host, port, user, password and protocol
    :return: the response object; its body has not been read by this function
    :raises Fail: if the final attempt also fails with a connection error
    """
    import params

    use_https = server.protocol.lower() == 'https'
    response = None

    # the CA certificate is only looked up when HTTPS is in use
    ca_certs = params.ams_grafana_ca_cert if use_https else None

    for attempt in xrange(0, params.grafana_connect_attempts):
        try:
            connection = network.get_http_connection(
                server.host,
                int(server.port),
                use_https,
                ca_certs,
                ssl_version=Script.get_force_https_protocol_value())

            credentials = b64encode(
                '{0}:{1}'.format(server.user, server.password))
            auth_headers = {'Authorization': 'Basic %s' % credentials}

            Logger.info("Connecting (DELETE) to %s:%s%s" %
                        (server.host, server.port, url))

            connection.request("DELETE", url, headers=auth_headers)
            response = connection.getresponse()
            Logger.info("Http response: %s %s" %
                        (response.status, response.reason))
        except (httplib.HTTPException, socket.error) as ex:
            out_of_attempts = attempt >= params.grafana_connect_attempts - 1
            if out_of_attempts:
                raise Fail("Ambari Metrics Grafana update failed due to: %s" %
                           str(ex))

            Logger.info(
                "Connection to Grafana failed. Next retry in %s seconds." %
                (params.grafana_connect_retry_delay))
            time.sleep(params.grafana_connect_retry_delay)
        else:
            # the request went through, no further attempts are needed
            break

    return response
Пример #7
0
    def _load_metric(self, ams_collector_host, ams_metric, host_filter):
        """
        Request a metric from the metrics collector with an HTTP GET.

        The query asks for `ams_metric` of application self.ams_app_id on the
        hosts given by `host_filter`, with "seconds" precision and grouped
        results.

        :param ams_collector_host: collector host to connect to
        :param ams_metric: value sent as the "metricNames" query parameter
        :param host_filter: value sent as the "hostname" query parameter
        :return: on any exception, the tuple (None, status) where status is
                 response.status if a response was obtained and None otherwise

        NOTE(review): the visible code has no return statement and no
        connection close on the success path - this snippet looks truncated;
        confirm against the full source.
        """
        get_metrics_parameters = {
            "metricNames": ams_metric,
            "appId": self.ams_app_id,
            "hostname": host_filter,
            "precision": "seconds",
            "grouped": "true",
        }
        encoded_get_metrics_parameters = urllib.urlencode(
            get_metrics_parameters)
        # AMS_METRICS_GET_URL is a %-format template taking the query string
        url = AMS_METRICS_GET_URL % encoded_get_metrics_parameters

        _ssl_version = AmbariConfig.get_resolved_config(
        ).get_force_https_protocol_value()

        # hard-coded location of the CA certificate handed to the connection
        ams_monitor_conf_dir = "/etc/ambari-metrics-monitor/conf"
        metric_truststore_ca_certs = 'ca.pem'
        ca_certs = os.path.join(ams_monitor_conf_dir,
                                metric_truststore_ca_certs)

        # pre-initialised so the except branch can inspect `response` even
        # when the failure happened before a response was received
        conn = None
        response = None
        data = None
        try:
            conn = network.get_http_connection(ams_collector_host,
                                               int(self.ams_collector_port),
                                               self.use_ssl,
                                               ca_certs,
                                               ssl_version=_ssl_version)
            conn.request("GET", url)
            response = conn.getresponse()
            data = response.read()
        except Exception, exception:
            # the failure is only logged when DEBUG logging is enabled
            if logger.isEnabledFor(logging.DEBUG):
                logger.exception(
                    "[Alert][{0}] Unable to retrieve metrics from AMS: {1}".
                    format(self.alert_id, str(exception)))
            status = response.status if response else None
            return None, status
Пример #8
0
    def service_check_for_single_host(self, metric_collector_host, params):
        """
        Run the metrics smoke test against one collector host.

        A smoke-test metrics document rendered from 'smoketest_metrics.json.j2'
        is posted to the collector, then the collector is queried until the
        posted values show up in the answer or self.AMS_READ_TRIES reads have
        been made, waiting self.AMS_READ_TIMEOUT seconds between reads.

        :param metric_collector_host: collector host to check
        :param params: object providing hostname, metric_collector_port,
                       metric_collector_https_enabled, ams_monitor_conf_dir
                       and metric_truststore_ca_certs
        :raises Fail: if posting fails, a read is not answered with status 200,
                      or the values are still missing after the last read; the
                      message names the collector host and the reason
        """
        random_value1 = random.random()
        headers = {"Content-type": "application/json"}
        ca_certs = os.path.join(params.ams_monitor_conf_dir,
                                params.metric_truststore_ca_certs)

        current_time = int(time.time()) * 1000
        metric_json = Template('smoketest_metrics.json.j2',
                               hostname=params.hostname,
                               random1=random_value1,
                               current_time=current_time).get_content()

        def floats_eq(f1, f2, delta):
            # floating point values are compared with a tolerance
            return abs(f1 - f2) < delta

        # keys under which the two smoke-test values are looked up in the answer
        first_key = str(current_time)
        second_key = str(current_time + 1000)

        try:
            post_metrics_to_collector(
                self.AMS_METRICS_POST_URL, metric_collector_host,
                params.metric_collector_port,
                params.metric_collector_https_enabled, metric_json, headers,
                ca_certs, self.AMS_CONNECT_TRIES, self.AMS_CONNECT_TIMEOUT)

            get_metrics_parameters = {
                "metricNames": "AMBARI_METRICS.SmokeTest.FakeMetric",
                "appId": "amssmoketestfake",
                "hostname": params.hostname,
                "startTime": current_time - 60000,
                "endTime": current_time + 61000,
                "precision": "seconds",
                "grouped": "false",
            }
            encoded_get_metrics_parameters = urllib.urlencode(
                get_metrics_parameters)
            get_url = self.AMS_METRICS_GET_URL % encoded_get_metrics_parameters

            Logger.info(
                "Connecting (GET) to %s:%s%s" %
                (metric_collector_host, params.metric_collector_port, get_url))
            for i in xrange(0, self.AMS_READ_TRIES):
                conn = network.get_http_connection(
                    metric_collector_host,
                    int(params.metric_collector_port),
                    params.metric_collector_https_enabled,
                    ca_certs,
                    ssl_version=Script.get_force_https_protocol_value())
                try:
                    conn.request("GET", get_url)
                    response = conn.getresponse()
                    Logger.info(
                        "Http response for host %s : %s %s" %
                        (metric_collector_host, response.status,
                         response.reason))

                    data = response.read()
                    Logger.info("Http data: %s" % data)
                finally:
                    # release the connection even if the request or read fails
                    conn.close()

                if response.status != 200:
                    raise Fail(
                        "Metrics were not retrieved from host %s. GET request status: %s %s \n%s"
                        % (metric_collector_host, response.status,
                           response.reason, data))
                Logger.info("Metrics were retrieved from host %s" %
                            metric_collector_host)

                data_json = json.loads(data)

                values_are_present = False
                for metrics_data in data_json["metrics"]:
                    found = metrics_data["metrics"]
                    if (first_key in found and second_key in found
                            and floats_eq(found[first_key], random_value1,
                                          0.0000001)
                            and floats_eq(found[second_key], current_time,
                                          1)):
                        # arguments follow the placeholders: the two values
                        # first, then the host
                        Logger.info(
                            "Values %s and %s were found in the response from host %s."
                            % (random_value1, current_time,
                               metric_collector_host))
                        values_are_present = True
                        break

                if values_are_present:
                    break

                # range/xrange returns items from start to end-1
                if i >= self.AMS_READ_TRIES - 1:
                    raise Fail(
                        "Values %s and %s were not found in the response."
                        % (random_value1, current_time))

                Logger.info(
                    "Values weren't stored yet. Retrying in %s seconds."
                    % (self.AMS_READ_TIMEOUT))
                time.sleep(self.AMS_READ_TIMEOUT)
        except Fail as ex:
            Logger.warning(
                "Ambari Metrics service check failed on collector host %s. Reason : %s"
                % (metric_collector_host, str(ex)))
            raise Fail(
                "Ambari Metrics service check failed on collector host %s. Reason : %s"
                % (metric_collector_host, str(ex)))
Пример #9
0
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  The data points of a metric over the last `interval` minutes are fetched
  from the metrics collector. Their sample standard deviation, expressed as a
  percentage of their mean, is compared against the critical and the warning
  threshold. For a NameNode alert with a nameservice configured, the NameNodes
  are queried over JMX to find the active one(s) first.

  Keyword arguments:
  configurations : a mapping of configuration key to value
  parameters : a mapping of script parameter key to value
  host_name : the name of this host where the alert is running

  :type configurations dict
  :type parameters dict
  :type host_name str
  """
  # NOTE: the mutable {} defaults are kept for interface compatibility; both
  # mappings are only read by this function, never modified.
  hostnames = host_name
  current_time = int(time.time()) * 1000

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  merge_ha_metrics = MERGE_HA_METRICS_PARAM_DEFAULT
  if MERGE_HA_METRICS_PARAM_KEY in parameters:
    merge_ha_metrics = parameters[MERGE_HA_METRICS_PARAM_KEY].lower() == 'true'

  metric_name = METRIC_NAME_PARAM_DEFAULT
  if METRIC_NAME_PARAM_KEY in parameters:
    metric_name = parameters[METRIC_NAME_PARAM_KEY]

  metric_units = METRIC_UNITS_DEFAULT
  if METRIC_UNITS_PARAM_KEY in parameters:
    metric_units = parameters[METRIC_UNITS_PARAM_KEY]

  app_id = APP_ID_PARAM_DEFAULT
  if APP_ID_PARAM_KEY in parameters:
    app_id = parameters[APP_ID_PARAM_KEY]

  interval = INTERVAL_PARAM_DEFAULT
  if INTERVAL_PARAM_KEY in parameters:
    interval = _coerce_to_integer(parameters[INTERVAL_PARAM_KEY])

  warning_threshold = DEVIATION_WARNING_THRESHOLD_DEFAULT
  if DEVIATION_WARNING_THRESHOLD_KEY in parameters:
    warning_threshold = _coerce_to_integer(parameters[DEVIATION_WARNING_THRESHOLD_KEY])

  critical_threshold = DEVIATION_CRITICAL_THRESHOLD_DEFAULT
  if DEVIATION_CRITICAL_THRESHOLD_KEY in parameters:
    critical_threshold = _coerce_to_integer(parameters[DEVIATION_CRITICAL_THRESHOLD_KEY])

  minimum_value_threshold = None
  if MINIMUM_VALUE_THRESHOLD_KEY in parameters:
    minimum_value_threshold = _coerce_to_integer(parameters[MINIMUM_VALUE_THRESHOLD_KEY])

  # parse configuration
  if configurations is None:
    return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])

  # hdfs-site is required; it is checked once here, so the code below can rely
  # on it being present
  if HDFS_SITE_KEY not in configurations:
    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

  if METRICS_COLLECTOR_VIP_HOST_KEY in configurations and METRICS_COLLECTOR_VIP_PORT_KEY in configurations:
    collector_host = configurations[METRICS_COLLECTOR_VIP_HOST_KEY]
    collector_port = int(configurations[METRICS_COLLECTOR_VIP_PORT_KEY])
  else:
    # ams-site/timeline.metrics.service.webapp.address is required
    if METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY not in configurations:
      return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)])

    collector_webapp_address = configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY].split(":")
    if not valid_collector_webapp_address(collector_webapp_address):
      return (RESULT_STATE_UNKNOWN, ['{0} value should be set as "fqdn_hostname:port", but set to {1}'.format(
        METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY, configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])])

    collector_host = select_metric_collector_for_sink(app_id.lower())
    collector_port = int(collector_webapp_address[1])

  namenode_service_rpc_address = None
  hdfs_site = configurations[HDFS_SITE_KEY]

  if 'dfs.namenode.servicerpc-address' in hdfs_site:
    namenode_service_rpc_address = hdfs_site['dfs.namenode.servicerpc-address']

  # if namenode alert and HA mode
  if NAMESERVICE_KEY in configurations and app_id.lower() == 'namenode':
    # NOTE(review): smokeuser stays unbound when SMOKEUSER_KEY is missing; the
    # kerberos branch below then fails with a NameError that is caught and
    # logged per NameNode - confirm whether a default user should be used.
    if SMOKEUSER_KEY in configurations:
      smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
      executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    # parse script arguments
    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
      security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
      kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
      kerberos_principal = configurations[KERBEROS_PRINCIPAL]
      kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # determine whether or not SSL is enabled
    is_ssl_enabled = False
    if DFS_POLICY_KEY in configurations:
      dfs_policy = configurations[DFS_POLICY_KEY]
      if dfs_policy == "HTTPS_ONLY":
        is_ssl_enabled = True

    kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)

    name_service = configurations[NAMESERVICE_KEY]

    # look for dfs.ha.namenodes.foo
    nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
    if nn_unique_ids_key not in hdfs_site:
      return (RESULT_STATE_UNKNOWN, ['Unable to find unique NameNode alias key {0}'.format(nn_unique_ids_key)])

    namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
    jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    if is_ssl_enabled:
      namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
      jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    # now we have something like 'nn1,nn2,nn3,nn4'
    # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
    # ie dfs.namenode.http-address.hacluster.nn1
    namenodes = []
    active_namenodes = []
    nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
    for nn_unique_id in nn_unique_ids:
      key = namenode_http_fragment.format(name_service, nn_unique_id)

      if key in hdfs_site:
        # use str() to ensure that unicode strings do not have the u' in them
        value = str(hdfs_site[key])
        namenode = value.split(":")[0]

        namenodes.append(namenode)
        try:
          jmx_uri = jmx_uri_fragment.format(value)
          if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)
            state_response, error_msg, time_millis = curl_krb_request(env.tmp_dir,
              kerberos_keytab, kerberos_principal, jmx_uri,"ha_nn_health", executable_paths, False,
              "NameNode High Availability Health", smokeuser, connection_timeout=curl_connection_timeout,
              kinit_timer_ms = kinit_timer_ms)
          else:
            state_response = get_jmx(jmx_uri, connection_timeout)

          state = _get_ha_state_from_json(state_response)

          if state == HDFS_NN_STATE_ACTIVE:
            active_namenodes.append(namenode)

            # Only check active NN
            nn_service_rpc_address_key = 'dfs.namenode.servicerpc-address.{0}.{1}'.format(name_service, nn_unique_id)
            if nn_service_rpc_address_key in hdfs_site:
              namenode_service_rpc_address = hdfs_site[nn_service_rpc_address_key]
        except Exception:
          # best effort: a NameNode whose state cannot be determined is logged
          # and not treated as active; SystemExit and KeyboardInterrupt are no
          # longer swallowed here
          logger.exception("Unable to determine the active NameNode")

    if merge_ha_metrics:
      hostnames = ",".join(namenodes)
      # run only on active NN, no need to run the same requests from the standby
      if host_name not in active_namenodes:
        return (RESULT_STATE_SKIPPED, ['This alert will be reported by another host.'])

  # Skip service rpc alert if port is not enabled
  if not namenode_service_rpc_address and 'rpc.rpc.datanode' in metric_name:
    return (RESULT_STATE_SKIPPED, ['Service RPC port is not enabled.'])

  get_metrics_parameters = {
    "metricNames": metric_name,
    "appId": app_id,
    "hostname": hostnames,
    "startTime": current_time - interval * 60 * 1000,
    "endTime": current_time,
    "grouped": "true",
    }

  encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)

  ams_monitor_conf_dir = "/etc/ambari-metrics-monitor/conf"
  metric_truststore_ca_certs='ca.pem'
  ca_certs = os.path.join(ams_monitor_conf_dir,
                          metric_truststore_ca_certs)
  metric_collector_https_enabled = str(configurations[AMS_HTTP_POLICY]) == "HTTPS_ONLY"

  try:
    conn = network.get_http_connection(collector_host, int(collector_port), metric_collector_https_enabled, ca_certs)
    try:
      conn.request("GET", AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
      response = conn.getresponse()
      data = response.read()
    finally:
      # release the connection even when the request or the read fails
      conn.close()
  except Exception:
    return (RESULT_STATE_UNKNOWN, ["Unable to retrieve metrics from the Ambari Metrics service."])

  if response.status != 200:
    return (RESULT_STATE_UNKNOWN, ["Unable to retrieve metrics from the Ambari Metrics service."])

  data_json = json.loads(data)
  metrics = []
  # will get large standard deviation for multiple hosts,
  # if host1 reports small local values, but host2 reports large local values
  for metrics_data in data_json["metrics"]:
    metrics += metrics_data["metrics"].values()

  if len(metrics) < 2:
    return (RESULT_STATE_SKIPPED, ["There are not enough data points to calculate the standard deviation ({0} sampled)".format(
      len(metrics))])

  minimum_value_multiplier = 1
  if 'dfs.FSNamesystem.CapacityUsed' in metric_name:
    minimum_value_multiplier = 1024 * 1024  # MB to bytes
  elif 'rpc.rpc.datanode' in metric_name or 'rpc.rpc.client' in metric_name:
    minimum_value_multiplier = 1000  # seconds to millis

  if minimum_value_threshold:
    # Filter out points below min threshold
    metrics = [metric for metric in metrics if metric > (minimum_value_threshold * minimum_value_multiplier)]
    if len(metrics) < 2:
      return (RESULT_STATE_OK, ['There were no data points above the minimum threshold of {0} seconds'.format(minimum_value_threshold)])

  mean_value = mean(metrics)
  stddev = sample_standard_deviation(metrics)

  try:
    deviation_percent = stddev / float(mean_value) * 100
  except ZeroDivisionError:
    # should not be a case for this alert
    return (RESULT_STATE_SKIPPED, ["Unable to calculate the standard deviation because the mean value is 0"])

  # log the AMS request
  if logger.isEnabledFor(logging.DEBUG):
    logger.debug("""
    AMS request parameters - {0}
    AMS response - {1}
    Mean - {2}
    Standard deviation - {3}
    Percentage standard deviation - {4}
    """.format(encoded_get_metrics_parameters, data_json, mean_value, stddev, deviation_percent))

  mean_value_localized = locale.format("%.0f", mean_value, grouping=True)

  variance_value = (deviation_percent / 100.0) * mean_value
  variance_value_localized = locale.format("%.0f", variance_value, grouping=True)

  # check for CRITICAL status
  if deviation_percent > critical_threshold:
    threshold_value = ((critical_threshold / 100.0) * mean_value)
    threshold_value_localized = locale.format("%.0f", threshold_value, grouping=True)

    message = DEVIATION_THRESHOLD_MESSAGE.format(variance_value_localized, metric_units, deviation_percent,
      mean_value_localized, metric_units, threshold_value_localized, metric_units)

    return (RESULT_STATE_CRITICAL,[message])

  # check for WARNING status
  if deviation_percent > warning_threshold:
    threshold_value = ((warning_threshold / 100.0) * mean_value)
    threshold_value_localized = locale.format("%.0f", threshold_value, grouping = True)

    message = DEVIATION_THRESHOLD_MESSAGE.format(variance_value_localized, metric_units, deviation_percent,
      mean_value_localized, metric_units, threshold_value_localized, metric_units)

    return (RESULT_STATE_WARNING, [message])

  # return OK status; use the warning threshold as the value to compare against
  threshold_value = ((warning_threshold / 100.0) * mean_value)
  threshold_value_localized = locale.format("%.0f", threshold_value, grouping = True)

  message = DEVIATION_OK_MESSAGE.format(variance_value_localized, metric_units, warning_threshold,
    mean_value_localized, metric_units, threshold_value_localized, metric_units)

  return (RESULT_STATE_OK,[message])
def execute(configurations={}, parameters={}, host_name=None):
    """
  Returns a tuple containing the result code and a pre-formatted result label

  Keyword arguments:
  configurations : a mapping of configuration key to value
  parameters : a mapping of script parameter key to value
  host_name : the name of this host where the alert is running

  :type configurations dict
  :type parameters dict
  :type host_name str
  """
    hostnames = host_name
    current_time = int(time.time()) * 1000

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    merge_ha_metrics = MERGE_HA_METRICS_PARAM_DEFAULT
    if MERGE_HA_METRICS_PARAM_KEY in parameters:
        merge_ha_metrics = parameters[MERGE_HA_METRICS_PARAM_KEY].lower(
        ) == 'true'

    metric_name = METRIC_NAME_PARAM_DEFAULT
    if METRIC_NAME_PARAM_KEY in parameters:
        metric_name = parameters[METRIC_NAME_PARAM_KEY]

    metric_units = METRIC_UNITS_DEFAULT
    if METRIC_UNITS_PARAM_KEY in parameters:
        metric_units = parameters[METRIC_UNITS_PARAM_KEY]

    app_id = APP_ID_PARAM_DEFAULT
    if APP_ID_PARAM_KEY in parameters:
        app_id = parameters[APP_ID_PARAM_KEY]

    interval = INTERVAL_PARAM_DEFAULT
    if INTERVAL_PARAM_KEY in parameters:
        interval = _coerce_to_integer(parameters[INTERVAL_PARAM_KEY])

    warning_threshold = DEVIATION_WARNING_THRESHOLD_DEFAULT
    if DEVIATION_WARNING_THRESHOLD_KEY in parameters:
        warning_threshold = _coerce_to_integer(
            parameters[DEVIATION_WARNING_THRESHOLD_KEY])

    critical_threshold = DEVIATION_CRITICAL_THRESHOLD_DEFAULT
    if DEVIATION_CRITICAL_THRESHOLD_KEY in parameters:
        critical_threshold = _coerce_to_integer(
            parameters[DEVIATION_CRITICAL_THRESHOLD_KEY])

    minimum_value_threshold = None
    if MINIMUM_VALUE_THRESHOLD_KEY in parameters:
        minimum_value_threshold = _coerce_to_integer(
            parameters[MINIMUM_VALUE_THRESHOLD_KEY])

    #parse configuration
    if configurations is None:
        return (RESULT_STATE_UNKNOWN,
                ['There were no configurations supplied to the script.'])

    # hdfs-site is required
    if not HDFS_SITE_KEY in configurations:
        return (RESULT_STATE_UNKNOWN, [
            '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        ])

    if METRICS_COLLECTOR_VIP_HOST_KEY in configurations and METRICS_COLLECTOR_VIP_PORT_KEY in configurations:
        collector_host = configurations[METRICS_COLLECTOR_VIP_HOST_KEY]
        collector_port = int(configurations[METRICS_COLLECTOR_VIP_PORT_KEY])
    else:
        # ams-site/timeline.metrics.service.webapp.address is required
        if not METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY in configurations:
            return (RESULT_STATE_UNKNOWN, [
                '{0} is a required parameter for the script'.format(
                    METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)
            ])
        else:
            collector_webapp_address = configurations[
                METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY].split(":")
            if valid_collector_webapp_address(collector_webapp_address):
                collector_host = select_metric_collector_for_sink(
                    app_id.lower())
                collector_port = int(collector_webapp_address[1])
            else:
                return (RESULT_STATE_UNKNOWN, [
                    '{0} value should be set as "fqdn_hostname:port", but set to {1}'
                    .format(
                        METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY,
                        configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])
                ])

    namenode_service_rpc_address = None
    # hdfs-site is required
    if not HDFS_SITE_KEY in configurations:
        return (RESULT_STATE_UNKNOWN, [
            '{0} is a required parameter for the script'.format(HDFS_SITE_KEY)
        ])

    hdfs_site = configurations[HDFS_SITE_KEY]

    if 'dfs.namenode.servicerpc-address' in hdfs_site:
        namenode_service_rpc_address = hdfs_site[
            'dfs.namenode.servicerpc-address']

    # if namenode alert and HA mode
    if NAMESERVICE_KEY in configurations and app_id.lower() == 'namenode':
        # hdfs-site is required
        if not HDFS_SITE_KEY in configurations:
            return (RESULT_STATE_UNKNOWN, [
                '{0} is a required parameter for the script'.format(
                    HDFS_SITE_KEY)
            ])

        if SMOKEUSER_KEY in configurations:
            smokeuser = configurations[SMOKEUSER_KEY]

        executable_paths = None
        if EXECUTABLE_SEARCH_PATHS in configurations:
            executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

        # read the security / Kerberos settings from the configurations
        security_enabled = False
        if SECURITY_ENABLED_KEY in configurations:
            security_enabled = str(
                configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

        kerberos_keytab = None
        if KERBEROS_KEYTAB in configurations:
            kerberos_keytab = configurations[KERBEROS_KEYTAB]

        kerberos_principal = None
        if KERBEROS_PRINCIPAL in configurations:
            kerberos_principal = configurations[KERBEROS_PRINCIPAL]
            kerberos_principal = kerberos_principal.replace('_HOST', host_name)

        # determine whether or not SSL is enabled
        is_ssl_enabled = False
        if DFS_POLICY_KEY in configurations:
            dfs_policy = configurations[DFS_POLICY_KEY]
            if dfs_policy == "HTTPS_ONLY":
                is_ssl_enabled = True

        kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER,
                                        DEFAULT_KERBEROS_KINIT_TIMER_MS)

        name_service = get_name_service_by_hostname(hdfs_site, host_name)

        # look for dfs.ha.namenodes.foo
        nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
        if not nn_unique_ids_key in hdfs_site:
            return (RESULT_STATE_UNKNOWN, [
                'Unable to find unique NameNode alias key {0}'.format(
                    nn_unique_ids_key)
            ])

        namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
        jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

        if is_ssl_enabled:
            namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
            jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

        # now we have something like 'nn1,nn2,nn3,nn4'
        # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
        # ie dfs.namenode.http-address.hacluster.nn1
        namenodes = []
        active_namenodes = []
        nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
        for nn_unique_id in nn_unique_ids:
            key = namenode_http_fragment.format(name_service, nn_unique_id)

            if key in hdfs_site:
                # use str() to ensure that unicode strings do not have the u' in them
                value = str(hdfs_site[key])
                namenode = str(hdfs_site[key]).split(":")[0]

                namenodes.append(namenode)
                try:
                    jmx_uri = jmx_uri_fragment.format(value)
                    if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
                        env = Environment.get_instance()

                        # curl requires an integer timeout
                        curl_connection_timeout = int(connection_timeout)
                        state_response, error_msg, time_millis = curl_krb_request(
                            env.tmp_dir,
                            kerberos_keytab,
                            kerberos_principal,
                            jmx_uri,
                            "ha_nn_health",
                            executable_paths,
                            False,
                            "NameNode High Availability Health",
                            smokeuser,
                            connection_timeout=curl_connection_timeout,
                            kinit_timer_ms=kinit_timer_ms)

                        state = _get_ha_state_from_json(state_response)
                    else:
                        state = _get_state_from_jmx(jmx_uri,
                                                    connection_timeout)

                    if state == HDFS_NN_STATE_ACTIVE:
                        active_namenodes.append(namenode)

                        # Only check active NN
                        nn_service_rpc_address_key = 'dfs.namenode.servicerpc-address.{0}.{1}'.format(
                            name_service, nn_unique_id)
                        if nn_service_rpc_address_key in hdfs_site:
                            namenode_service_rpc_address = hdfs_site[
                                nn_service_rpc_address_key]
                    pass
                except:
                    logger.exception("Unable to determine the active NameNode")
        pass

        if merge_ha_metrics:
            hostnames = ",".join(namenodes)
            # run only on active NN, no need to run the same requests from the standby
            if host_name not in active_namenodes:
                return (RESULT_STATE_SKIPPED,
                        ['This alert will be reported by another host.'])
        pass

    # Skip service rpc alert if port is not enabled
    if not namenode_service_rpc_address and 'rpc.rpc.datanode' in metric_name:
        return (RESULT_STATE_SKIPPED, ['Service RPC port is not enabled.'])

    get_metrics_parameters = {
        "metricNames": metric_name,
        "appId": app_id,
        "hostname": hostnames,
        "startTime": current_time - interval * 60 * 1000,
        "endTime": current_time,
        "grouped": "true",
    }

    encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)

    ams_monitor_conf_dir = "/etc/ambari-metrics-monitor/conf"
    metric_truststore_ca_certs = 'ca.pem'
    ca_certs = os.path.join(ams_monitor_conf_dir, metric_truststore_ca_certs)
    metric_collector_https_enabled = str(
        configurations[AMS_HTTP_POLICY]) == "HTTPS_ONLY"

    _ssl_version = _get_ssl_version()
    try:
        conn = network.get_http_connection(collector_host,
                                           int(collector_port),
                                           metric_collector_https_enabled,
                                           ca_certs,
                                           ssl_version=_ssl_version)
        conn.request("GET",
                     AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
        response = conn.getresponse()
        data = response.read()
        conn.close()
    except Exception, e:
        logger.info(str(e))
        return (RESULT_STATE_UNKNOWN, [
            "Unable to retrieve metrics from the Ambari Metrics service."
        ])