def falcon(type, action = None, upgrade_type=None):
    """
    Write the Falcon configuration files or drive the Falcon Windows service.

    :param type: component kind; the service is only touched for 'server'
    :param action: 'config' writes the configuration files, 'start' / 'stop'
      control the service (for 'server' only); other values skip those steps
    :param upgrade_type: accepted by the signature but not read in this body
    """
    import params

    if action == 'config':
        # falcon_host / falcon_port are published in the environment params
        # because ../templates/client.properties.j2 reads them from there.
        env_params = Environment.get_instance().config.params
        env_params["falcon_host"] = params.falcon_host
        env_params["falcon_port"] = params.falcon_port

        File(os.path.join(params.falcon_conf_dir, 'falcon-env.sh'),
             content=InlineTemplate(params.falcon_env_sh_template))
        PropertiesFile(os.path.join(params.falcon_conf_dir, 'runtime.properties'),
                       properties=params.falcon_runtime_properties)
        PropertiesFile(os.path.join(params.falcon_conf_dir, 'startup.properties'),
                       properties=params.falcon_startup_properties)
        PropertiesFile(os.path.join(params.falcon_conf_dir, 'client.properties'),
                       properties=params.falcon_client_properties)

    # Everything below concerns the Windows service, which only the server has.
    if type != 'server':
        return

    ServiceConfig(params.falcon_win_service_name,
                  action="change_user",
                  username=params.falcon_user,
                  password=Script.get_password(params.falcon_user))

    if action == 'start':
        Service(params.falcon_win_service_name, action="start")
    elif action == 'stop':
        Service(params.falcon_win_service_name, action="stop")
def format(self, format_string, *args, **kwargs):
    """
    Render format_string with the keyword arguments combined with the
    parameters of the current Environment.

    The two mappings are combined through checked_unite() and handed to
    vformat() together with the positional arguments.
    """
    env_params = Environment.get_instance().config.params
    return self.vformat(format_string, args, checked_unite(kwargs, env_params))
def format(self, format_string, *args, **kwargs):
    """
    Render format_string twice - once with protected and once with unprotected
    field conversion - and return the unprotected rendering.

    When the two renderings differ, the unprotected text is registered in
    Logger.sensitive_strings, mapped to its protected counterpart.
    """
    # checked_unite is deliberately avoided here: it would interfere with
    # reload(module) for things like params and status_params. Start from a
    # copy of the environment parameters (when an Environment exists) and let
    # the explicitly passed variables override them.
    if Environment.has_instance():
        merged = Environment.get_instance().config.params.copy()
    else:
        merged = {}
    merged.update(kwargs)

    self.convert_field = self.convert_field_protected
    masked = self.vformat(format_string, args, merged)

    self.convert_field = self.convert_field_unprotected
    plain = self.vformat(format_string, args, merged)

    if masked != plain:
        Logger.sensitive_strings[plain] = masked

    return plain
def falcon(type, action = None):
    """
    Write the Falcon configuration files or drive the Falcon Windows service.

    :param type: component kind; the service is only touched for 'server'
    :param action: 'config' writes the configuration files, 'start' / 'stop'
      control the service (for 'server' only); other values skip those steps
    """
    import params

    if action == 'config':
        # falcon_host / falcon_port are published in the environment params
        # because ../templates/client.properties.j2 reads them from there.
        env_params = Environment.get_instance().config.params
        env_params["falcon_host"] = params.falcon_host
        env_params["falcon_port"] = params.falcon_port

        File(os.path.join(params.falcon_conf_dir, 'falcon-env.sh'),
             content=InlineTemplate(params.falcon_env_sh_template))
        File(os.path.join(params.falcon_conf_dir, 'client.properties'),
             content=Template('client.properties.j2'))
        PropertiesFile(os.path.join(params.falcon_conf_dir, 'runtime.properties'),
                       properties=params.falcon_runtime_properties)
        PropertiesFile(os.path.join(params.falcon_conf_dir, 'startup.properties'),
                       properties=params.falcon_startup_properties)

    # Everything below concerns the Windows service, which only the server has.
    if type != 'server':
        return

    ServiceConfig(params.falcon_win_service_name,
                  action="change_user",
                  username=params.falcon_user,
                  password=Script.get_password(params.falcon_user))

    if action == 'start':
        Service(params.falcon_win_service_name, action="start")
    elif action == 'stop':
        Service(params.falcon_win_service_name, action="stop")
def get_check_command(oozie_url, host_name, configurations, parameters):
    """
    Build the shell command that queries the Oozie server status.

    When security is enabled a kinit is executed first (skipped by the shell
    when klist reports a usable cache) against a per-process credentials cache.

    :param oozie_url: URL passed to "oozie admin -oozie"
    :param host_name: replaces _HOST (lower-cased) in a configured principal
    :param configurations: mapping of configuration key to value
    :param parameters: mapping of script parameter key to value
    :return: tuple (command, kerberos_env, user); kerberos_env is None when
      security is disabled
    """
    user = configurations.get(USER_KEY, USER_DEFAULT)
    kerberos_env = None

    security_enabled = (SECURITY_ENABLED in configurations and
                        str(configurations[SECURITY_ENABLED]).upper() == 'TRUE')

    if security_enabled:
        # NOTE: format() below is called without arguments, so it has to pick
        # the placeholder values up from this function's local names; keep
        # the names used in the template unchanged.
        user_keytab = USER_KEYTAB_DEFAULT
        user_principal = USER_PRINCIPAL_DEFAULT

        # script parameters override the defaults ...
        if USER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
            user_principal = parameters[USER_PRINCIPAL_SCRIPT_PARAM_KEY]
            user_principal = user_principal.replace('_HOST', host_name.lower())
        user_keytab = parameters.get(USER_KEYTAB_SCRIPT_PARAM_KEY, user_keytab)

        # ... and configurations are applied last so they always win
        if USER_PRINCIPAL_KEY in configurations:
            user_principal = configurations[USER_PRINCIPAL_KEY]
            user_principal = user_principal.replace('_HOST', host_name.lower())
        user_keytab = configurations.get(USER_KEYTAB_KEY, user_keytab)

        # Per-process credentials cache, exported through KRB5CCNAME
        env = Environment.get_instance()
        ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, os.sep, os.getpid())
        kerberos_env = {'KRB5CCNAME': ccache_file}

        # Optional search paths for the Kerberos executables
        kerberos_executable_search_paths = configurations.get(
            KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY)

        klist_path_local = get_klist_path(kerberos_executable_search_paths)
        kinit_path_local = get_kinit_path(kerberos_executable_search_paths)

        # "klist -s <cache> || kinit ...": the shell only runs kinit when the
        # klist test fails. Tickets are short lived so that fewer kinits are
        # needed while regenerated keytabs are still picked up quickly.
        kinit_command = "{0} -s {1} || ".format(klist_path_local, ccache_file) + format("{kinit_path_local} -l 5m20s -c {ccache_file} -kt {user_keytab} {user_principal}; ")

        Execute(kinit_command, environment=kerberos_env, user=user)

    # oozie configuration directory uses a symlink when > HDP 2.2
    if os.path.exists(OOZIE_CONF_DIR):
        oozie_config_directory = OOZIE_CONF_DIR
    else:
        oozie_config_directory = OOZIE_CONF_DIR_LEGACY

    command = "source {0}/oozie-env.sh ; oozie admin -oozie {1} -status".format(
        oozie_config_directory, oozie_url)

    return (command, kerberos_env, user)
def __new__(cls, name, env=None, provider=None, **kwargs):
    """
    Return the resource registered under (class name, name) in the
    environment, creating and registering it on first use.

    A list of names is consumed in place: every entry except the last is
    instantiated through cls(...), and the last entry is handled by this call.
    An already registered resource gets its arguments overridden with kwargs.

    Raises Fail when the registered resource uses a different provider.
    """
    if isinstance(name, list):
        # pop() mutates the list that was passed in; only the final element
        # is left in it once the loop is done.
        while len(name) != 1:
            cls(name.pop(0), env, provider, **kwargs)
        name = name[0]

    env = env or Environment.get_instance()
    provider = provider or getattr(cls, 'provider', None)

    r_type = cls.__name__
    if r_type not in env.resources:
        env.resources[r_type] = {}
    registry = env.resources[r_type]

    if name in registry:
        existing = registry[name]
        if existing.provider != provider:
            raise Fail("Duplicate resource %r with a different provider %r != %r" % (existing, provider, existing.provider))
        existing.override(**kwargs)
        return existing

    created = super(Resource, cls).__new__(cls)
    registry[name] = created
    env.resource_list.append(created)
    return created
def action_delayed_for_nameservice(self, nameservice, action_name, main_resource):
    """
    Queue a JSON-style description of main_resource for a later 'execute'.

    The description is appended to env.config['hdfs_files_sudo'] when
    main_resource.create_as_root is set, otherwise to env.config['hdfs_files'].

    :param nameservice: stored under the 'nameservice' key of the description
    :param action_name: stored as the value of the 'action' field
    :param main_resource: provider wrapper whose .resource attributes are
      copied according to RESOURCE_TO_JSON_FIELDS
    """
    env = Environment.get_instance()
    env_dict_key = 'hdfs_files_sudo' if main_resource.create_as_root else 'hdfs_files'

    if main_resource.create_as_root:
        Logger.info("Will create {0} as root user".format(main_resource.resource.target))

    if env_dict_key not in env.config:
        env.config[env_dict_key] = []

    # Put values in dictionary-resource
    resource = {}
    for field_name, json_field_name in RESOURCE_TO_JSON_FIELDS.items():
        if field_name == 'action':
            resource[json_field_name] = action_name
        elif field_name == 'mode' and main_resource.resource.mode:
            # '%o' yields the bare octal digits on every Python version;
            # oct(mode)[1:] only did so for Python 2 ints ('0755' -> '755')
            # and would give 'o755' on Python 3 or '755L' for a long.
            resource[json_field_name] = '%o' % main_resource.resource.mode
        elif field_name == 'manage_if_exists':
            resource[json_field_name] = main_resource.manage_if_exists
        else:
            # only fields with a truthy value are written
            value = getattr(main_resource.resource, field_name)
            if value:
                resource[json_field_name] = value

    resource['nameservice'] = nameservice

    # Add resource to create
    env.config[env_dict_key].append(resource)
def get_check_command(oozie_url, host_name, configurations):
    """
    Build the shell command that queries the Oozie server status.

    When security is enabled the credentials cache is tested with klist and a
    kinit is executed as the Oozie user if that test fails.

    :param oozie_url: URL passed to "oozie admin -oozie"
    :param host_name: replaces _HOST in the configured principal
    :param configurations: mapping of configuration key to value
    :return: tuple (command, kerberos_env, oozie_user); kerberos_env is None
      when security is disabled
    :raises Exception: when the Oozie user is not configured
    :raises KerberosPropertiesNotFound: when security is enabled but keytab
      or principal is missing
    """
    if OOZIE_USER not in configurations:
        raise Exception("Oozie user is required")
    oozie_user = configurations[OOZIE_USER]

    security_enabled = (SECURITY_ENABLED in configurations and
                        str(configurations[SECURITY_ENABLED]).upper() == 'TRUE')
    kerberos_env = None

    if security_enabled:
        if not (OOZIE_KEYTAB in configurations and OOZIE_PRINCIPAL in configurations):
            raise KerberosPropertiesNotFound('The Oozie keytab and principal are required configurations when security is enabled.')

        # NOTE: format() below is called without arguments, so it has to pick
        # the placeholder values up from this function's local names; keep
        # the names used in the templates unchanged.
        oozie_keytab = configurations[OOZIE_KEYTAB]
        # _HOST in the kerberos principal stands for the actual fqdn
        oozie_principal = configurations[OOZIE_PRINCIPAL].replace('_HOST', host_name)

        # Per-process credentials cache, exported through KRB5CCNAME
        env = Environment.get_instance()
        ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, os.sep, os.getpid())
        kerberos_env = {'KRB5CCNAME': ccache_file}

        # Optional search paths for the Kerberos executables
        kerberos_executable_search_paths = configurations.get(
            KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY)

        klist_path_local = get_klist_path(kerberos_executable_search_paths)
        klist_command = format("{klist_path_local} -s {ccache_file}")

        # Only kinit when the cache is missing or holds no valid ticket.
        # Tickets are short lived so that fewer kinits are needed while
        # regenerated keytabs are still picked up quickly.
        return_code, _ = call(klist_command, user=oozie_user)
        if return_code != 0:
            kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
            kinit_command = format("{kinit_path_local} -l 5m -kt {oozie_keytab} {oozie_principal}; ")

            Execute(kinit_command, environment=kerberos_env, user=oozie_user)

    # oozie configuration directory uses a symlink when > HDP 2.2
    if os.path.exists(OOZIE_CONF_DIR):
        oozie_config_directory = OOZIE_CONF_DIR
    else:
        oozie_config_directory = OOZIE_CONF_DIR_LEGACY

    command = "source {0}/oozie-env.sh ; oozie admin -oozie {1} -status".format(
        oozie_config_directory, oozie_url)

    return (command, kerberos_env, oozie_user)
def get_check_command(oozie_url, host_name, configurations):
    """
    Build the shell command that queries the Oozie server status.

    When security is enabled the credentials cache is tested with klist and a
    kinit is executed if that test fails.

    :param oozie_url: URL passed to "oozie admin -oozie"
    :param host_name: replaces _HOST in the configured principal
    :param configurations: mapping of configuration key to value
    :return: tuple (command, kerberos_env); kerberos_env is None when
      security is disabled
    :raises KerberosPropertiesNotFound: when security is enabled but keytab
      or principal is missing
    """
    security_enabled = (SECURITY_ENABLED in configurations and
                        str(configurations[SECURITY_ENABLED]).upper() == 'TRUE')
    kerberos_env = None

    if security_enabled:
        if not (OOZIE_KEYTAB in configurations and OOZIE_PRINCIPAL in configurations):
            raise KerberosPropertiesNotFound('The Oozie keytab and principal are required configurations when security is enabled.')

        # NOTE: format() below is called without arguments, so it has to pick
        # the placeholder values up from this function's local names; keep
        # the names used in the templates unchanged.
        oozie_keytab = configurations[OOZIE_KEYTAB]
        # _HOST in the kerberos principal stands for the actual fqdn
        oozie_principal = configurations[OOZIE_PRINCIPAL].replace('_HOST', host_name)

        # Per-process credentials cache, exported through KRB5CCNAME
        env = Environment.get_instance()
        ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, os.sep, os.getpid())
        kerberos_env = {'KRB5CCNAME': ccache_file}

        # Optional search paths for the Kerberos executables
        kerberos_executable_search_paths = configurations.get(
            KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY)

        klist_path_local = get_klist_path(kerberos_executable_search_paths)
        klist_command = format("{klist_path_local} -s {ccache_file}")

        # Only kinit when the cache is missing or holds no valid ticket.
        # Tickets are short lived so that fewer kinits are needed while
        # regenerated keytabs are still picked up quickly.
        return_code, _ = call(klist_command)
        if return_code != 0:
            kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
            kinit_command = format("{kinit_path_local} -l 5m -kt {oozie_keytab} {oozie_principal}; ")

            Execute(kinit_command, environment=kerberos_env)

    # oozie configuration directory uses a symlink when > HDP 2.2
    if os.path.exists(OOZIE_CONF_DIR):
        oozie_config_directory = OOZIE_CONF_DIR
    else:
        oozie_config_directory = OOZIE_CONF_DIR_LEGACY

    command = "source {0}/oozie-env.sh ; oozie admin -oozie {1} -status".format(
        oozie_config_directory, oozie_url)

    return (command, kerberos_env)
def configure(self, env):
    """
    Set up the configuration directory of this component.

    For the 'monitor' component with params.monitor_security_enabled set,
    random key/store passwords are generated, passed to setup_conf_dir() as
    extra HTTPS properties, and keytool is run to create a key pair, export
    its certificate and import it into a trust store. Any other case only
    calls setup_conf_dir().

    :param env: execution environment; receives the params module
    """
    import params
    env.set_params(params)

    if params.monitor_security_enabled and self.component == 'monitor':
        import os
        import random
        import string

        basedir = Environment.get_instance().config.basedir
        keystore_file = os.path.join(basedir, "files", "keystore.jks")
        truststore_file = os.path.join(basedir, "files", "cacerts.jks")
        cert_file = os.path.join(basedir, "files", "server.cer")

        if os.path.exists(keystore_file) or os.path.exists(truststore_file) or os.path.exists(cert_file):
            self.fail_with_error("trying to create monitor certs but they already existed")

        # ascii_* constants are locale independent and exist on Python 2 and 3
        # (string.lowercase / string.uppercase are neither).
        goodchars = string.ascii_lowercase + string.ascii_uppercase + string.digits + '#%+,-./:=?@^_'
        # Passwords come from the OS entropy source (random.SystemRandom), not
        # the default Mersenne Twister generator, which is unsuitable for
        # secrets. No helper local is introduced on purpose: format() below is
        # called without arguments, so it has to resolve placeholders from
        # this function's local names, and that set is left untouched.
        keypass = ''.join(random.SystemRandom().choice(goodchars) for _ in range(20))
        storepass = ''.join(random.SystemRandom().choice(goodchars) for _ in range(20))

        https_params = {}
        https_params[params.keystore_property] = params.keystore_path
        https_params[params.truststore_property] = params.truststore_path
        https_params[params.keystore_password_property] = keypass
        https_params[params.truststore_password_property] = storepass

        setup_conf_dir(name=self.component, extra_params=https_params)

        # NOTE(review): both passwords are interpolated into the keytool
        # command lines below - confirm this exposure is acceptable.
        Execute(format("{java64_home}/bin/keytool -genkey -alias \"default\" -keyalg RSA -keypass {keypass} -storepass {storepass} -keystore {keystore_file} -dname \"CN=Unknown, OU=Unknown, O=Unknown, L=Unknown, ST=Unknown, C=Unknown\""),
                user=params.accumulo_user)
        Execute(format("{java64_home}/bin/keytool -export -alias \"default\" -storepass {storepass} -file {cert_file} -keystore {keystore_file}"),
                user=params.accumulo_user)
        Execute(format("echo \"yes\" | {java64_home}/bin/keytool -import -v -trustcacerts -alias \"default\" -file {cert_file} -keystore {truststore_file} -keypass {keypass} -storepass {storepass}"),
                user=params.accumulo_user)

        accumulo_StaticFile("keystore.jks")
        accumulo_StaticFile("cacerts.jks")
    else:
        setup_conf_dir(name=self.component)
def __get_delegation_token(self, user, keytab, principal, kinit_path):
    """
    Gets the kerberos delegation token from name node

    :param user: user passed on to curl_krb_request
    :param keytab: keytab passed on to curl_krb_request
    :param principal: principal passed on to curl_krb_request
    :param kinit_path: Kerberos executable search path passed on to
      curl_krb_request
    :return: the token's 'urlString' value; None when the response carries no
      token, in which case the error is logged and self.checks_failed is
      incremented
    """
    import params
    url = params.namenode_path + "/webhdfs/v1/?op=GETDELEGATIONTOKEN"
    Logger.info("Getting delegation token from {0}".format(url))
    response, _, _ = curl_krb_request(Environment.get_instance().tmp_dir, keytab, principal,
                                      url, "get_delegation_token", kinit_path, False, "Delegation Token", user)
    json_response = json.loads(response)

    # A reply without a 'Token' entry (or with an unexpected shape) has to end
    # up in the failure path below; indexing it directly raised KeyError /
    # TypeError instead and skipped both the log message and the counter.
    token = json_response.get('Token') if isinstance(json_response, dict) else None
    url_string = token.get('urlString') if isinstance(token, dict) else None
    if url_string:
        return url_string

    error_msg = "Get Token: Unable to get kerberos delegation token from webhdfs: \nurl = {0}, user = {1}, keytab = {2}, principal = {3}, kinit-path = {4} \nresponse = {5}".format(url, user, keytab, principal, kinit_path, json_response)
    Logger.error(error_msg)
    self.checks_failed += 1
def format(self, format_string, *args, **kwargs):
    """
    Render format_string twice - once with protected and once with unprotected
    field conversion - and return the unprotected rendering.

    The keyword arguments are combined with the Environment parameters through
    checked_unite(). When the two renderings differ, the unprotected text is
    registered in Logger.sensitive_strings, mapped to its protected
    counterpart.
    """
    env_params = Environment.get_instance().config.params
    merged = checked_unite(kwargs, env_params)

    self.convert_field = self.convert_field_protected
    masked = self.vformat(format_string, args, merged)

    self.convert_field = self.convert_field_unprotected
    plain = self.vformat(format_string, args, merged)

    if masked != plain:
        Logger.sensitive_strings[plain] = masked

    return plain
def action_delayed(self, action_name, main_resource):
    """
    Queue a JSON-style description of main_resource for a later 'execute'.

    The description is appended to env.config['hdfs_files'].

    :param action_name: stored as the value of the 'action' field
    :param main_resource: provider wrapper whose .resource attributes are
      copied according to RESOURCE_TO_JSON_FIELDS
    """
    env = Environment.get_instance()
    if 'hdfs_files' not in env.config:
        env.config['hdfs_files'] = []

    # Put values in dictionary-resource
    resource = {}
    for field_name, json_field_name in RESOURCE_TO_JSON_FIELDS.items():
        if field_name == 'action':
            resource[json_field_name] = action_name
        elif field_name == 'mode' and main_resource.resource.mode:
            # '%o' yields the bare octal digits on every Python version;
            # oct(mode)[1:] only did so for Python 2 ints ('0755' -> '755')
            # and would give 'o755' on Python 3 or '755L' for a long.
            resource[json_field_name] = '%o' % main_resource.resource.mode
        else:
            # only fields with a truthy value are written
            value = getattr(main_resource.resource, field_name)
            if value:
                resource[json_field_name] = value

    # Add resource to create
    env.config['hdfs_files'].append(resource)
def action_execute(self, main_resource):
    """
    Flush the queued HDFS resource descriptions.

    The queue in env.config['hdfs_files'] is dumped to a JSON file, the hadoop
    jar is executed on that file, and the queue is emptied. Nothing happens
    (apart from an info message) when the queue is missing or empty.

    :param main_resource: provider wrapper supplying the resource attributes,
      has_core_configs, assert_parameter_is_set() and kinit()
    """
    env = Environment.get_instance()

    # 'user' is only mandatory when core configs are available
    if main_resource.has_core_configs:
        main_resource.assert_parameter_is_set('user')

    if not ('hdfs_files' in env.config and env.config['hdfs_files']):
        Logger.info(
            "No resources to create. 'create_on_execute' or 'delete_on_execute' or 'download_on_execute' wasn't triggered before this 'execute' action."
        )
        return

    # NOTE: format() is called without arguments below, so it has to resolve
    # placeholders from this function's local names. Locals that look unused
    # (timestamp, keytab_file, ...) are therefore kept under their original
    # names; JSON_PATH is defined outside this block and may reference them.
    hadoop_bin_dir = main_resource.resource.hadoop_bin_dir
    hadoop_conf_dir = main_resource.resource.hadoop_conf_dir
    if main_resource.has_core_configs:
        user = main_resource.resource.user
    else:
        user = None
    security_enabled = main_resource.resource.security_enabled
    keytab_file = main_resource.resource.keytab
    kinit_path = main_resource.resource.kinit_path_local
    logoutput = main_resource.resource.logoutput
    principal_name = main_resource.resource.principal_name
    jar_path = JAR_PATH
    timestamp = time.time()
    json_path = format(JSON_PATH)

    if security_enabled:
        main_resource.kinit()

    # Persist the queue so the jar can read it
    File(json_path,
         owner=user,
         content=json.dumps(env.config['hdfs_files']))

    # Let the jar create/delete the resources in hadoop
    Execute(format("hadoop --config {hadoop_conf_dir} jar {jar_path} {json_path}"),
            user=user,
            path=[hadoop_bin_dir],
            logoutput=logoutput)

    # Start over with an empty queue
    env.config['hdfs_files'] = []
def action_execute(self, main_resource, sudo=False):
    """
    Flush one of the queues of HDFS resource descriptions.

    The queue ('hdfs_files_sudo' when sudo is set, 'hdfs_files' otherwise) is
    dumped to a JSON file, the hadoop jar is executed on that file, and the
    queue is emptied. Returns silently when the queue is missing or empty.

    :param main_resource: provider wrapper supplying the resource attributes,
      assert_parameter_is_set() and kinit()
    :param sudo: selects the sudo queue and is passed on to Execute; no user
      is set in that case
    """
    env = Environment.get_instance()
    env_dict_key = 'hdfs_files_sudo' if sudo else 'hdfs_files'

    if not (env_dict_key in env.config and env.config[env_dict_key]):
        return

    # NOTE: format() is called without arguments below, so it has to resolve
    # placeholders from this function's local names. Locals that look unused
    # (timestamp, keytab_file, ...) are therefore kept under their original
    # names; JSON_PATH is defined outside this block and may reference them.
    if sudo:
        user = None
    else:
        # 'user' is a required parameter for the non-sudo flow
        main_resource.assert_parameter_is_set('user')
        user = main_resource.resource.user

    hadoop_bin_dir = main_resource.resource.hadoop_bin_dir
    hadoop_conf_dir = main_resource.resource.hadoop_conf_dir
    security_enabled = main_resource.resource.security_enabled
    keytab_file = main_resource.resource.keytab
    kinit_path = main_resource.resource.kinit_path_local
    logoutput = main_resource.resource.logoutput
    principal_name = main_resource.resource.principal_name
    jar_path = JAR_PATH
    timestamp = time.time()
    json_path = format(JSON_PATH)

    if security_enabled:
        main_resource.kinit()

    # Persist the queue so the jar can read it
    File(json_path,
         owner=user,
         content=json.dumps(env.config[env_dict_key]))

    # Let the jar create/delete the resources in hadoop
    Execute(('hadoop', '--config', hadoop_conf_dir, 'jar', jar_path, json_path),
            user=user,
            path=[hadoop_bin_dir],
            logoutput=logoutput,
            sudo=sudo)

    # Start over with an empty queue
    env.config[env_dict_key] = []
def _get_delegation_token(namenode_address, user, keytab, principal, kinit_path):
    """
    Gets the kerberos delegation token from name node

    :param namenode_address: base address the webhdfs path is appended to
    :param user: user passed on to curl_krb_request
    :param keytab: keytab passed on to curl_krb_request
    :param principal: principal passed on to curl_krb_request
    :param kinit_path: Kerberos executable search path passed on to
      curl_krb_request
    :return: the token's 'urlString' value
    :raises Exception: when the response does not carry a token
    """
    url = namenode_address + "/webhdfs/v1/?op=GETDELEGATIONTOKEN"
    logger.info("Getting delegation token from {0} for PXF".format(url))
    response, _, _ = curl_krb_request(Environment.get_instance().tmp_dir, keytab, principal,
                                      url, "get_delegation_token", kinit_path, False, "Delegation Token", user)
    json_response = json.loads(response)

    # A reply without a 'Token' entry (or with an unexpected shape) has to end
    # up in the failure path below; indexing it directly raised KeyError /
    # TypeError instead of the logged, descriptive exception.
    token = json_response.get('Token') if isinstance(json_response, dict) else None
    url_string = token.get('urlString') if isinstance(token, dict) else None
    if url_string:
        return url_string

    msg = "Unable to get delegation token for PXF"
    logger.error(msg)
    raise Exception(msg)
def __new__(cls, name, env=None, provider=None, **kwargs):
    """
    Create a resource object and register it in the environment under
    (class name, name) and in env.resource_list.

    A list of names is consumed in place: every entry except the last is
    instantiated through cls(...), and the last entry is handled by this call.
    An entry already registered under the same key is replaced.
    """
    if isinstance(name, list):
        # pop() mutates the list that was passed in; only the final element
        # is left in it once the loop is done.
        while len(name) != 1:
            cls(name.pop(0), env, provider, **kwargs)
        name = name[0]

    env = env or Environment.get_instance()
    # resolved like in the original, although nothing below reads it
    provider = provider or getattr(cls, 'provider', None)

    r_type = cls.__name__
    if r_type not in env.resources:
        env.resources[r_type] = {}

    created = super(Resource, cls).__new__(cls)
    env.resources[r_type][name] = created
    env.resource_list.append(created)
    return created
def call_curl_request(self,user,keytab,principal, url, flag_http_response, request_method='GET',request_body='',header=''):
    """
    Issue a request through curl_krb_request on behalf of a service user.

    :param user: service user for which the call is made
    :param keytab: keytab of the service user
    :param principal: principal of the service user
    :param url: url the call is made to
    :param flag_http_response: flag to get only the response code or the response string
    :param request_method: http method (GET / POST / PUT / DELETE)
    :param request_body: data sent along with the request
    :param header: http header required for the call
    :return: tuple (response, error_msg, time_millis) as produced by curl_krb_request
    """
    tmp_dir = Environment.get_instance().tmp_dir
    result, failure, elapsed_millis = curl_krb_request(
        tmp_dir, keytab, principal, url, 'ranger_admin_calls', None,
        flag_http_response, "Ranger-Admin API calls", user,
        kinit_timer_ms=0, method=request_method, body=request_body, header=header)

    return result, failure, elapsed_millis
def configure(self, env):
    """
    Set up the configuration directory of this component.

    For the 'monitor' component with params.monitor_security_enabled set,
    random key/store passwords are generated, passed to setup_conf_dir() as
    extra HTTPS properties, and keytool is run to create a key pair, export
    its certificate and import it into a trust store. Any other case only
    calls setup_conf_dir().

    :param env: execution environment; receives the params module
    """
    import params
    env.set_params(params)

    if params.monitor_security_enabled and self.component == 'monitor':
        import os
        import random
        import string

        basedir = Environment.get_instance().config.basedir
        keystore_file = os.path.join(basedir, "files", "keystore.jks")
        truststore_file = os.path.join(basedir, "files", "cacerts.jks")
        cert_file = os.path.join(basedir, "files", "server.cer")

        if os.path.exists(keystore_file) or os.path.exists(truststore_file) or os.path.exists(cert_file):
            self.fail_with_error("trying to create monitor certs but they already existed")

        # ascii_* constants are locale independent and exist on Python 2 and 3
        # (string.lowercase / string.uppercase are neither).
        goodchars = string.ascii_lowercase + string.ascii_uppercase + string.digits + '#%+,-./:=?@^_'
        # Passwords come from the OS entropy source (random.SystemRandom), not
        # the default Mersenne Twister generator, which is unsuitable for
        # secrets. No helper local is introduced on purpose: format() below is
        # called without arguments, so it has to resolve placeholders from
        # this function's local names, and that set is left untouched.
        keypass = ''.join(random.SystemRandom().choice(goodchars) for _ in range(20))
        storepass = ''.join(random.SystemRandom().choice(goodchars) for _ in range(20))

        https_params = {}
        https_params[params.keystore_property] = params.keystore_path
        https_params[params.truststore_property] = params.truststore_path
        https_params[params.keystore_password_property] = keypass
        https_params[params.truststore_password_property] = storepass

        setup_conf_dir(name=self.component, extra_params=https_params)

        # NOTE(review): both passwords are interpolated into the keytool
        # command lines below - confirm this exposure is acceptable.
        Execute(format("{java64_home}/bin/keytool -genkey -alias \"default\" -keyalg RSA -keypass {keypass} -storepass {storepass} -keystore {keystore_file} -dname \"CN=Unknown, OU=Unknown, O=Unknown, L=Unknown, ST=Unknown, C=Unknown\""),
                user=params.accumulo_user)
        Execute(format("{java64_home}/bin/keytool -export -alias \"default\" -storepass {storepass} -file {cert_file} -keystore {keystore_file}"),
                user=params.accumulo_user)
        Execute(format("echo \"yes\" | {java64_home}/bin/keytool -import -v -trustcacerts -alias \"default\" -file {cert_file} -keystore {truststore_file} -keypass {keypass} -storepass {storepass}"),
                user=params.accumulo_user)

        accumulo_StaticFile("keystore.jks")
        accumulo_StaticFile("cacerts.jks")
    else:
        setup_conf_dir(name=self.component)
def action_execute(self, main_resource):
    """
    Flush the queued HDFS resource descriptions.

    The queue in env.config['hdfs_files'] is dumped to a JSON file, the hadoop
    jar is executed on that file, and the queue is emptied. Nothing happens
    (apart from an info message) when the queue is missing or empty.

    :param main_resource: provider wrapper supplying the resource attributes,
      assert_parameter_is_set() and kinit()
    """
    env = Environment.get_instance()

    # 'user' is a required parameter
    main_resource.assert_parameter_is_set('user')

    if not ('hdfs_files' in env.config and env.config['hdfs_files']):
        Logger.info("No resources to create. 'create_on_execute' or 'delete_on_execute' wasn't triggered before this 'execute' action.")
        return

    # NOTE: format() is called without arguments below, so it has to resolve
    # placeholders from this function's local names. Locals that look unused
    # (timestamp, keytab_file, ...) are therefore kept under their original
    # names; JSON_PATH is defined outside this block and may reference them.
    hadoop_bin_dir = main_resource.resource.hadoop_bin_dir
    hadoop_conf_dir = main_resource.resource.hadoop_conf_dir
    user = main_resource.resource.user
    security_enabled = main_resource.resource.security_enabled
    keytab_file = main_resource.resource.keytab
    kinit_path = main_resource.resource.kinit_path_local
    logoutput = main_resource.resource.logoutput
    principal_name = main_resource.resource.principal_name
    jar_path = JAR_PATH
    timestamp = time.time()
    json_path = format(JSON_PATH)

    if security_enabled:
        main_resource.kinit()

    # Persist the queue so the jar can read it
    File(json_path,
         owner=user,
         content=json.dumps(env.config['hdfs_files']))

    # Let the jar create/delete the resources in hadoop
    Execute(format("hadoop --config {hadoop_conf_dir} jar {jar_path} {json_path}"),
            user=user,
            path=[hadoop_bin_dir],
            logoutput=logoutput)

    # Start over with an empty queue
    env.config['hdfs_files'] = []
def __init__(self, name, env=None, provider=None, **kwargs):
    """
    Initialise the resource and validate its keyword arguments.

    Initialisation is skipped when the instance already carries a 'name'
    attribute. Each keyword argument is looked up in self._arguments and
    stored in self.arguments after validation.

    :param name: resource name; for a list, its first element is popped
      (which mutates the list that was passed in)
    :param env: environment to attach to; defaults to the current Environment
    :param provider: explicit provider; falls back to an existing 'provider'
      attribute, else None
    :raises Fail: for a keyword that is not a declared argument
    :raises InvalidArgument: when a value fails validation; the message is
      prefixed with the resource
    """
    if isinstance(name, list):
        name = name.pop(0)

    # an instance that already has a name is left untouched
    if hasattr(self, 'name'):
        return

    self.env = env or Environment.get_instance()
    self.name = name
    self.provider = provider or getattr(self, 'provider', None)

    self.arguments = {}
    for key, value in kwargs.items():
        try:
            arg = self._arguments[key]
        except KeyError:
            raise Fail("%s received unsupported argument %s" % (self, key))

        try:
            self.arguments[key] = arg.validate(value)
        # "as" form: valid on Python 2.6+ and 3; the former
        # "except InvalidArgument, exc" is a syntax error on Python 3.
        except InvalidArgument as exc:
            raise InvalidArgument("%s %s" % (self, exc))
def get_check_command(oozie_url, host_name, parameters):
    """
    Build the shell command that queries the Oozie server status.

    When security is enabled the credentials cache is tested with klist and a
    kinit is executed if that test fails.

    :param oozie_url: URL substituted into the "oozie admin" command
    :param host_name: replaces _HOST in the configured principal
    :param parameters: mapping of parameter key to value
    :return: tuple (command, kerberos_env); kerberos_env is None when
      security is disabled
    :raises KerberosPropertiesNotFound: when security is enabled but keytab
      or principal is missing
    """
    security_enabled = (SECURITY_ENABLED in parameters and
                        str(parameters[SECURITY_ENABLED]).upper() == 'TRUE')
    kerberos_env = None

    if security_enabled:
        if not (OOZIE_KEYTAB in parameters and OOZIE_PRINCIPAL in parameters):
            raise KerberosPropertiesNotFound('The Oozie keytab and principal are required parameters when security is enabled.')

        # NOTE: format() below is called without arguments, so it has to pick
        # the placeholder values up from this function's local names; keep
        # the names used in the templates unchanged.
        oozie_keytab = parameters[OOZIE_KEYTAB]
        # _HOST in the kerberos principal stands for the actual fqdn
        oozie_principal = parameters[OOZIE_PRINCIPAL].replace('_HOST', host_name)

        # Per-process credentials cache, exported through KRB5CCNAME
        env = Environment.get_instance()
        ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, os.sep, os.getpid())
        kerberos_env = {'KRB5CCNAME': ccache_file}

        klist_path_local = get_klist_path()
        klist_command = format("{klist_path_local} -s {ccache_file}")

        # Only kinit when the cache is missing or holds no valid ticket.
        # Tickets are short lived so that fewer kinits are needed while
        # regenerated keytabs are still picked up quickly.
        return_code, _ = call(klist_command)
        if return_code != 0:
            kinit_path_local = get_kinit_path()
            kinit_command = format("{kinit_path_local} -l 5m -kt {oozie_keytab} {oozie_principal}; ")

            Execute(kinit_command, environment=kerberos_env)

    command = format("source /etc/oozie/conf/oozie-env.sh ; oozie admin -oozie {oozie_url} -status")

    return (command, kerberos_env)
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Every NameNode listed for the nameservice is asked for its HA state; the
    topology counts as healthy with exactly one active and one standby node.
    In a healthy topology only the active NameNode host reports (OK); in an
    unhealthy one only the first listed host reports (CRITICAL). All other
    hosts return SKIPPED.

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    # The dict defaults are part of the public signature and are only read.
    if configurations is None:
        return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])

    # if not in HA mode, then SKIP
    if NAMESERVICE_KEY not in configurations:
        return (RESULT_STATE_SKIPPED, ['NameNode HA is not enabled'])

    # hdfs-site is required
    if HDFS_SITE_KEY not in configurations:
        return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

    # NOTE(review): smokeuser stays unbound when SMOKEUSER_KEY is missing. The
    # kerberized request below then fails with a NameError that the handler
    # in the loop turns into an "unknown" NameNode - confirm this is intended.
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
        executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # determine whether or not SSL is enabled
    is_ssl_enabled = False
    if DFS_POLICY_KEY in configurations:
        dfs_policy = configurations[DFS_POLICY_KEY]
        if dfs_policy == "HTTPS_ONLY":
            is_ssl_enabled = True

    name_service = configurations[NAMESERVICE_KEY]
    hdfs_site = configurations[HDFS_SITE_KEY]

    # look for dfs.ha.namenodes.foo
    nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
    if nn_unique_ids_key not in hdfs_site:
        return (RESULT_STATE_UNKNOWN, ['Unable to find unique namenode alias key {0}'.format(nn_unique_ids_key)])

    namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
    jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    if is_ssl_enabled:
        namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
        jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    active_namenodes = []
    standby_namenodes = []
    unknown_namenodes = []

    # now we have something like 'nn1,nn2,nn3,nn4'
    # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
    # ie dfs.namenode.http-address.hacluster.nn1
    nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
    for nn_unique_id in nn_unique_ids:
        key = namenode_http_fragment.format(name_service, nn_unique_id)

        if key not in hdfs_site:
            continue

        # use str() to ensure that unicode strings do not have the u' in them
        value = str(hdfs_site[key])

        try:
            jmx_uri = jmx_uri_fragment.format(value)
            if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
                env = Environment.get_instance()

                # curl requires an integer timeout
                curl_connection_timeout = int(connection_timeout)

                state_response, error_msg, time_millis = curl_krb_request(
                    env.tmp_dir, kerberos_keytab, kerberos_principal,
                    jmx_uri, "ha_nn_health", executable_paths, False,
                    "NameNode High Availability Health", smokeuser,
                    connection_timeout=curl_connection_timeout)

                state = _get_ha_state_from_json(state_response)
            else:
                state_response = get_jmx(jmx_uri, connection_timeout)
                state = _get_ha_state_from_json(state_response)

            if state == HDFS_NN_STATE_ACTIVE:
                active_namenodes.append(value)
            elif state == HDFS_NN_STATE_STANDBY:
                standby_namenodes.append(value)
            else:
                unknown_namenodes.append(value)
        except Exception:
            # Best effort: any failure while querying a NameNode marks it as
            # unknown. Exception (rather than a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            unknown_namenodes.append(value)

    # now that the request is done, determine if this host is the host that
    # should report the status of the HA topology
    is_active_namenode = False
    for active_namenode in active_namenodes:
        if active_namenode.startswith(host_name):
            is_active_namenode = True

    # there's only one scenario here; there is exactly 1 active and 1 standby
    is_topology_healthy = len(active_namenodes) == 1 and len(standby_namenodes) == 1

    result_label = 'Active{0}, Standby{1}, Unknown{2}'.format(
        str(active_namenodes), str(standby_namenodes), str(unknown_namenodes))

    # Healthy Topology:
    #   - Active NN reports the alert, standby does not
    #
    # Unhealthy Topology:
    #   - Report the alert if this is the first named host
    #   - Report the alert if not the first named host, but the other host
    #     could not report its status
    if is_topology_healthy:
        if is_active_namenode:
            return (RESULT_STATE_OK, [result_label])
        return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])

    # dfs.namenode.rpc-address.service.alias is guaranteed in HA mode
    first_listed_host_key = 'dfs.namenode.rpc-address.{0}.{1}'.format(
        name_service, nn_unique_ids[0])

    first_listed_host = ''
    if first_listed_host_key in hdfs_site:
        first_listed_host = hdfs_site[first_listed_host_key]

    if first_listed_host.startswith(host_name):
        return (RESULT_STATE_CRITICAL, [result_label])

    # not the first listed host, but the first host might be in the unknown
    return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Probes the WebHCat (Templeton) status URL on this host. When security is
    enabled, a Kerberos ticket cache is checked with klist (and created with
    kinit if that check fails), curl is asked for just the HTTP status code
    and, if that code is 200, curl is run again to fetch the JSON body.

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running;
      socket.getfqdn() is used when it is None

    Returns:
    (RESULT_CODE_UNKNOWN, [text]) when configurations is None, or when
      security is enabled but the WebHCat keytab/principal are not configured
    (RESULT_CODE_CRITICAL, [text]) when curl reports no connection, a
      non-200 status, or any exception is raised while probing
    """
    result_code = RESULT_CODE_UNKNOWN

    if configurations is None:
        return (result_code, ['There were no configurations supplied to the script.'])

    webhcat_port = WEBHCAT_PORT_DEFAULT
    if TEMPLETON_PORT_KEY in configurations:
        webhcat_port = int(configurations[TEMPLETON_PORT_KEY])

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = configurations[SECURITY_ENABLED_KEY].lower() == 'true'

    # parse script arguments; the curl variant of the timeout is the same
    # value truncated to an integer and rendered as a string for the argv list
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
        curl_connection_timeout = str(int(connection_timeout))

    # the alert will always run on the webhcat host
    if host_name is None:
        host_name = socket.getfqdn()

    # the script parameter, when present, wins over the configured smoke user
    smokeuser = SMOKEUSER_DEFAULT
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]
    if SMOKEUSER_SCRIPT_PARAM_KEY in parameters:
        smokeuser = parameters[SMOKEUSER_SCRIPT_PARAM_KEY]

    # webhcat always uses http, never SSL
    query_url = "http://{0}:{1}/templeton/v1/status?user.name={2}".format(
        host_name, webhcat_port, smokeuser)

    # initialize
    total_time = 0
    json_response = {}

    if security_enabled:
        if WEBHCAT_KEYTAB_KEY not in configurations or WEBHCAT_PRINCIPAL_KEY not in configurations:
            return (RESULT_CODE_UNKNOWN, [str(configurations)])

        try:
            webhcat_keytab = configurations[WEBHCAT_KEYTAB_KEY]
            webhcat_principal = configurations[WEBHCAT_PRINCIPAL_KEY]

            # substitute _HOST in kerberos principal with actual fqdn
            webhcat_principal = webhcat_principal.replace('_HOST', host_name)

            # Create the kerberos credentials cache (ccache) file and set it in
            # the environment to use when executing curl
            env = Environment.get_instance()
            ccache_file = "{0}{1}webhcat_alert_cc_{2}".format(env.tmp_dir, sep, getpid())
            kerberos_env = {'KRB5CCNAME': ccache_file}

            # Get the configured Kerberos executable search paths, if any
            if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
                kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
            else:
                kerberos_executable_search_paths = None

            # NOTE(review): `format` looks like the project's template helper
            # that resolves {name} placeholders from local variables - keep the
            # local names used in the templates below unchanged; confirm
            # against the file's imports.
            klist_path_local = get_klist_path(kerberos_executable_search_paths)
            klist_command = format("{klist_path_local} -s {ccache_file}")

            # Determine if we need to kinit by testing to see if the relevant
            # cache exists and has non-expired tickets. Tickets are marked to
            # expire after 5 minutes to help reduce the number of kinits we do
            # but recover quickly when keytabs are regenerated
            return_code, _ = call(klist_command)
            if return_code != 0:
                kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
                kinit_command = format(
                    "{kinit_path_local} -l 5m -c {ccache_file} -kt {webhcat_keytab} {webhcat_principal}; ")

                # kinit so that curl will work with --negotiate
                Execute(kinit_command)

            # make a single curl call to get just the http code
            curl = subprocess.Popen(
                ['curl', '--negotiate', '-u', ':', '-sL', '-w', '%{http_code}',
                 '--connect-timeout', curl_connection_timeout,
                 '-o', '/dev/null', query_url],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=kerberos_env)

            stdout, stderr = curl.communicate()

            if stderr != '':
                raise Exception(stderr)

            # check the response code
            response_code = int(stdout)

            # 0 indicates no connection
            if response_code == 0:
                label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
                return (RESULT_CODE_CRITICAL, [label])

            # any other response aside from 200 is a problem
            if response_code != 200:
                label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url)
                return (RESULT_CODE_CRITICAL, [label])

            # now that we have the http status and it was 200, get the content
            start_time = time.time()
            curl = subprocess.Popen(
                ['curl', '--negotiate', '-u', ':', '-sL',
                 '--connect-timeout', curl_connection_timeout, query_url],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=kerberos_env)

            stdout, stderr = curl.communicate()
            total_time = time.time() - start_time

            if stderr != '':
                raise Exception(stderr)

            json_response = json.loads(stdout)
        # `as` form works on Python 2.6+ and 3, unlike `except X, name`
        except Exception as exception:
            return (RESULT_CODE_CRITICAL, [str(exception)])

    # NOTE(review): as written here, nothing is returned once the JSON has
    # been parsed (or when security is disabled); json_response and total_time
    # are computed but not yet evaluated - confirm the intended tail.
def __init__(self, env=None):
    """
    Bind this object to an execution environment.

    env: the environment to use; when it is omitted (or otherwise falsy) the
    object returned by Environment.get_instance() is used instead.
    """
    if not env:
        env = Environment.get_instance()
    self.env = env
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Probes the WebHCat (Templeton) status URL on this host. When security is
    enabled, curl_krb_request is called once for just the HTTP status code
    and, if that code is 200, a second time to fetch the JSON body.

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running;
      socket.getfqdn() is used when it is None

    Returns:
    (RESULT_CODE_UNKNOWN, [text]) when configurations is None
    (RESULT_CODE_CRITICAL, [text]) when the status code is 0 (no connection),
      is not 200, or any exception is raised while probing
    """
    result_code = RESULT_CODE_UNKNOWN

    if configurations is None:
        return (result_code, ['There were no configurations supplied to the script.'])

    webhcat_port = WEBHCAT_PORT_DEFAULT
    if TEMPLETON_PORT_KEY in configurations:
        webhcat_port = int(configurations[TEMPLETON_PORT_KEY])

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = configurations[SECURITY_ENABLED_KEY].lower() == 'true'

    # parse script arguments; the curl variant of the timeout is the same
    # value truncated to an integer and rendered as a string
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
        curl_connection_timeout = str(int(connection_timeout))

    # the alert will always run on the webhcat host
    if host_name is None:
        host_name = socket.getfqdn()

    smokeuser = SMOKEUSER_DEFAULT
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    # webhcat always uses http, never SSL
    query_url = "http://{0}:{1}/templeton/v1/status?user.name={2}".format(
        host_name, webhcat_port, smokeuser)

    # initialize
    total_time = 0
    json_response = {}

    if security_enabled:
        try:
            # defaults
            smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT
            smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT

            # check script params
            if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
                smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY]
            if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
                smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY]

            # check configurations last as they should always take precedence
            if SMOKEUSER_PRINCIPAL_KEY in configurations:
                smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]
            if SMOKEUSER_KEYTAB_KEY in configurations:
                smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY]

            # Get the configured Kerberos executable search paths, if any
            kerberos_executable_search_paths = None
            if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
                kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]

            env = Environment.get_instance()

            # first request: the 7th argument (True) asks for the HTTP code only
            stdout, stderr, time_millis = curl_krb_request(
                env.tmp_dir, smokeuser_keytab, smokeuser_principal, query_url,
                "webhcat_alert_cc_", kerberos_executable_search_paths, True,
                "WebHCat Server Status", smokeuser,
                connection_timeout=curl_connection_timeout)

            # check the response code
            response_code = int(stdout)

            # 0 indicates no connection
            if response_code == 0:
                label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
                return (RESULT_CODE_CRITICAL, [label])

            # any other response aside from 200 is a problem
            if response_code != 200:
                label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url)
                return (RESULT_CODE_CRITICAL, [label])

            # now that we have the http status and it was 200, get the content
            stdout, stderr, total_time = curl_krb_request(
                env.tmp_dir, smokeuser_keytab, smokeuser_principal, query_url,
                "webhcat_alert_cc_", kerberos_executable_search_paths, False,
                "WebHCat Server Status", smokeuser,
                connection_timeout=curl_connection_timeout)

            json_response = json.loads(stdout)
        # `as` form works on Python 2.6+ and 3, unlike `except X, name`
        except Exception as exception:
            return (RESULT_CODE_CRITICAL, [str(exception)])

    # NOTE(review): as written here, nothing is returned once the JSON has
    # been parsed (or when security is disabled); json_response and total_time
    # are computed but not yet evaluated - confirm the intended tail.
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Reads the NameNode's last checkpoint time and journal transaction ids
    over JMX and compares them with the configured checkpoint transaction
    count and checkpoint period.

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running

    Returns:
    ('UNKNOWN', [text]) when configurations is None or any exception is
      raised while querying/evaluating (text is the exception message)
    ('CRITICAL' | 'WARNING' | 'OK', [label]) otherwise; the non-OK states
      require both the transaction gap to exceed checkpoint_tx and the
      elapsed share of checkpoint_period to reach the respective percentage
    """
    if configurations is None:
        return (('UNKNOWN', ['There were no configurations supplied to the script.']))

    uri = None
    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'
    checkpoint_tx = CHECKPOINT_TX_DEFAULT
    checkpoint_period = CHECKPOINT_PERIOD_DEFAULT

    if NN_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NN_HTTP_ADDRESS_KEY]

    if NN_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NN_HTTPS_ADDRESS_KEY]

    if NN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[NN_HTTP_POLICY_KEY]

    if NN_CHECKPOINT_TX_KEY in configurations:
        checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY]

    if NN_CHECKPOINT_PERIOD_KEY in configurations:
        checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        # host_name defaults to None and str.replace() rejects a None
        # replacement, so only substitute when a host name was supplied
        if host_name is not None:
            kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # thresholds arrive as fractions and are compared as percentages
    percent_warning = PERCENT_WARNING_DEFAULT
    if PERCENT_WARNING_KEY in parameters:
        percent_warning = float(parameters[PERCENT_WARNING_KEY]) * 100

    percent_critical = PERCENT_CRITICAL_DEFAULT
    if PERCENT_CRITICAL_KEY in parameters:
        percent_critical = float(parameters[PERCENT_CRITICAL_KEY]) * 100

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'

        if https_uri is not None:
            uri = https_uri

    # milliseconds since the epoch
    current_time = int(round(time.time() * 1000))

    last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(
        scheme, uri)
    journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(
        scheme, uri)

    # start out assuming an OK status
    label = None
    result_code = "OK"

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal,
                last_checkpoint_time_qry, "checkpoint_time_alert", None, False,
                "NameNode Last Checkpoint")
            last_checkpoint_time_response_json = json.loads(last_checkpoint_time_response)
            last_checkpoint_time = int(
                last_checkpoint_time_response_json["beans"][0]["LastCheckpointTime"])

            journal_transaction_info_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal,
                journal_transaction_info_qry, "checkpoint_time_alert", None, False,
                "NameNode Last Checkpoint")
            journal_transaction_info_response_json = json.loads(journal_transaction_info_response)
            journal_transaction_info = journal_transaction_info_response_json[
                "beans"][0]["JournalTransactionInfo"]
        else:
            last_checkpoint_time = int(
                get_value_from_jmx(last_checkpoint_time_qry, "LastCheckpointTime", connection_timeout))
            journal_transaction_info = get_value_from_jmx(
                journal_transaction_info_qry, "JournalTransactionInfo", connection_timeout)

        # the JournalTransactionInfo value is itself a JSON document
        journal_transaction_info_dict = json.loads(journal_transaction_info)

        last_tx = int(journal_transaction_info_dict['LastAppliedOrWrittenTxId'])
        most_recent_tx = int(journal_transaction_info_dict['MostRecentCheckpointTxId'])
        transaction_difference = last_tx - most_recent_tx

        # whole seconds since the last checkpoint; both operands are ints, so
        # `//` matches what `/` did on Python 2 and stays integral on Python 3
        delta = (current_time - last_checkpoint_time) // 1000

        elapsed = get_time(delta)
        label = LABEL.format(h=elapsed['h'], m=elapsed['m'], tx=transaction_difference)

        if (transaction_difference > int(checkpoint_tx)) and (
                float(delta) / int(checkpoint_period) * 100 >= int(percent_critical)):
            result_code = 'CRITICAL'
        elif (transaction_difference > int(checkpoint_tx)) and (
                float(delta) / int(checkpoint_period) * 100 >= int(percent_warning)):
            result_code = 'WARNING'

    # `as` form works on Python 2.6+ and 3, unlike `except X, name`
    except Exception as e:
        label = str(e)
        result_code = 'UNKNOWN'

    # hand the computed state back to the caller, as the docstring promises
    return (result_code, [label])
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Queries the NodeManager's /ws/v1/node/info REST endpoint, through
    curl_krb_request when Kerberos is fully configured and enabled, otherwise
    through urllib2.

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running;
      socket.getfqdn() is used where a host name is needed and it is None

    Returns:
    (RESULT_CODE_UNKNOWN, [text]) when configurations is None
    (RESULT_CODE_CRITICAL, [text]) when the query raises urllib2.HTTPError
    """
    result_code = RESULT_CODE_UNKNOWN

    if configurations is None:
        return (result_code, ['There were no configurations supplied to the script.'])

    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'

    # NOTE(review): smokeuser stays unbound when SMOKEUSER_KEY is not
    # configured, yet the Kerberos branch below reads it - confirm that the
    # key is always supplied for secured clusters.
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        # resolve the host name before substituting it: str.replace() raises
        # TypeError when the replacement is None (the parameter's default)
        if host_name is None:
            host_name = socket.getfqdn()
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

    if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

    if YARN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[YARN_HTTP_POLICY_KEY]

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'

        if https_uri is not None:
            uri = https_uri

    label = ''
    url_response = None
    node_healthy = 'false'
    total_time = 0

    # some yarn-site structures don't have the web ui address
    if uri is None:
        if host_name is None:
            host_name = socket.getfqdn()

        uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)

    if OSCheck.is_windows_family():
        uri_host, uri_port = uri.split(':')
        # on windows 0.0.0.0 is invalid address to connect but on linux it
        # resolved to 127.0.0.1
        uri_host = resolve_address(uri_host)
        uri = '{0}:{1}'.format(uri_host, uri_port)

    query = "{0}://{1}/ws/v1/node/info".format(scheme, uri)

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)

            url_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal, query,
                "nm_health_alert", None, False, "NodeManager Health", smokeuser,
                connection_timeout=curl_connection_timeout)

            json_response = json.loads(url_response)
        else:
            # execute the query for the JSON that describes this node
            url_response = urllib2.urlopen(query, timeout=connection_timeout)
            json_response = json.loads(url_response.read())
    # `as` form works on Python 2.6+ and 3, unlike `except X, name`
    except urllib2.HTTPError as httpError:
        label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query, str(httpError))
        return (RESULT_CODE_CRITICAL, [label])

    # NOTE(review): as written here, json_response is parsed but never
    # evaluated and nothing is returned on success - confirm the intended tail.
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Fetches the WebHCat (Templeton) status document for this host, through
    curl_krb_request when security is enabled and through urllib2 otherwise,
    and compares its 'status' entry with WEBHCAT_OK_RESPONSE.

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running;
      socket.getfqdn() is used when it is None
    """
    unknown_code = RESULT_CODE_UNKNOWN
    if configurations is None:
        return (unknown_code, ['There were no configurations supplied to the script.'])

    if TEMPLETON_PORT_KEY in configurations:
        webhcat_port = int(configurations[TEMPLETON_PORT_KEY])
    else:
        webhcat_port = WEBHCAT_PORT_DEFAULT

    security_enabled = (SECURITY_ENABLED_KEY in configurations
                        and configurations[SECURITY_ENABLED_KEY].lower() == 'true')

    # script arguments: the curl flavour of the timeout is the same value
    # truncated to an integer and rendered as a string
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
        curl_connection_timeout = str(int(connection_timeout))

    # this alert runs on the WebHCat host itself
    if host_name is None:
        host_name = socket.getfqdn()

    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]
    else:
        smokeuser = SMOKEUSER_DEFAULT

    # plain http only; WebHCat is never queried over SSL here
    query_url = "http://{0}:{1}/templeton/v1/status?user.name={2}".format(
        host_name, webhcat_port, smokeuser)

    total_time = 0
    json_response = {}

    if security_enabled:
        try:
            # start from the defaults, let script parameters override them,
            # and let configurations override both
            smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT
            smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT

            if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
                smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY]
            if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
                smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY]

            if SMOKEUSER_PRINCIPAL_KEY in configurations:
                smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]
            if SMOKEUSER_KEYTAB_KEY in configurations:
                smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY]

            # optional search paths for the Kerberos executables
            if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
                kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
            else:
                kerberos_executable_search_paths = None

            env = Environment.get_instance()

            # first round trip: HTTP status code only (7th argument True)
            code_text, _code_err, _code_millis = curl_krb_request(
                env.tmp_dir, smokeuser_keytab, smokeuser_principal, query_url,
                "webhcat_alert_cc_", kerberos_executable_search_paths, True,
                "WebHCat Server Status", smokeuser,
                connection_timeout=curl_connection_timeout)

            response_code = int(code_text)

            # 0 means no connection could be made
            if response_code == 0:
                return (RESULT_CODE_CRITICAL,
                        [CRITICAL_CONNECTION_MESSAGE.format(query_url, traceback.format_exc())])

            # anything but 200 is a failure as well
            if response_code != 200:
                return (RESULT_CODE_CRITICAL,
                        [CRITICAL_HTTP_MESSAGE.format(response_code, query_url, traceback.format_exc())])

            # status was 200: second round trip for the actual content
            body_text, _body_err, total_time = curl_krb_request(
                env.tmp_dir, smokeuser_keytab, smokeuser_principal, query_url,
                "webhcat_alert_cc_", kerberos_executable_search_paths, False,
                "WebHCat Server Status", smokeuser,
                connection_timeout=curl_connection_timeout)

            json_response = json.loads(body_text)
        except:
            return (RESULT_CODE_CRITICAL, [traceback.format_exc()])
    else:
        url_response = None

        try:
            # time only the open call, then read and parse the JSON status
            started_at = time.time()
            url_response = urllib2.urlopen(query_url, timeout=connection_timeout)
            total_time = time.time() - started_at

            json_response = json.loads(url_response.read())
        except urllib2.HTTPError as http_error:
            return (RESULT_CODE_CRITICAL,
                    [CRITICAL_HTTP_MESSAGE.format(http_error.code, query_url, traceback.format_exc())])
        except:
            return (RESULT_CODE_CRITICAL,
                    [CRITICAL_CONNECTION_MESSAGE.format(query_url, traceback.format_exc())])
        finally:
            # best-effort close; a failure to close must not change the result
            if url_response is not None:
                try:
                    url_response.close()
                except:
                    pass

    # without a 'status' entry there is nothing to check; report CRITICAL
    if 'status' not in json_response:
        return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE + str(json_response)])

    try:
        webhcat_status = json_response['status']
    except:
        return (RESULT_CODE_CRITICAL,
                [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE + "\n" + traceback.format_exc()])

    # compare the reported status against the known-good value
    if webhcat_status.lower() == WEBHCAT_OK_RESPONSE:
        return (RESULT_CODE_OK, [OK_MESSAGE.format(total_time, query_url)])

    return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_STATUS_MESSAGE.format(webhcat_status)])
def get_check_command(oozie_url, host_name, configurations, parameters, only_kinit):
    """
    Build the shell command that asks Oozie for its status.

    When is_security_enabled(configurations) is true, a Kerberos ticket is
    obtained first (under the shared Kerberos lock) into a per-process
    credentials cache, and that cache is exposed through KRB5CCNAME.

    oozie_url: URL handed to `oozie admin -oozie`
    host_name: substituted (lower-cased) for _HOST in the principal
    configurations: mapping of configuration key to value
    parameters: mapping of script parameter key to value
    only_kinit: when true, kinit is run unconditionally; otherwise it is only
      run if the preceding `klist -s` on the cache fails

    Returns (command, kerberos_env, user); kerberos_env is None when
    security is not enabled.
    """
    kerberos_env = None

    if USER_KEY in configurations:
        user = configurations[USER_KEY]
    else:
        user = USER_DEFAULT

    if is_security_enabled(configurations):
        # precedence, lowest to highest: defaults, script params, configurations
        user_keytab = USER_KEYTAB_DEFAULT
        user_principal = USER_PRINCIPAL_DEFAULT

        if USER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
            user_principal = parameters[USER_PRINCIPAL_SCRIPT_PARAM_KEY].replace(
                '_HOST', host_name.lower())
        if USER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
            user_keytab = parameters[USER_KEYTAB_SCRIPT_PARAM_KEY]

        if USER_PRINCIPAL_KEY in configurations:
            user_principal = configurations[USER_PRINCIPAL_KEY].replace(
                '_HOST', host_name.lower())
        if USER_KEYTAB_KEY in configurations:
            user_keytab = configurations[USER_KEYTAB_KEY]

        # per-process credentials cache file, exported to the commands we run
        env = Environment.get_instance()
        ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, os.sep, os.getpid())
        kerberos_env = {'KRB5CCNAME': ccache_file}

        # optional search paths for the Kerberos executables
        if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
            kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
        else:
            kerberos_executable_search_paths = None

        klist_path_local = get_klist_path(kerberos_executable_search_paths)
        kinit_path_local = get_kinit_path(kerberos_executable_search_paths)

        # NOTE(review): `format` looks like the project's template helper that
        # resolves {name} placeholders from local variables - the local names
        # used in this template must stay as they are; confirm against imports.
        kinit_part_command = format("{kinit_path_local} -l 5m20s -c {ccache_file} -kt {user_keytab} {user_principal}; ")

        # Unless a kinit is forced, first test the cache with `klist -s` and
        # only fall through to kinit when that test fails. Short-lived tickets
        # keep the number of kinits low while still recovering quickly after
        # keytabs are regenerated.
        kinit_command = kinit_part_command if only_kinit else (
            "{0} -s {1} || ".format(klist_path_local, ccache_file) + kinit_part_command)

        # prevent concurrent kinit
        kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
        kinit_lock.acquire()
        try:
            Execute(kinit_command, environment=kerberos_env, user=user)
        finally:
            kinit_lock.release()

    # stack root: configured name/root pair when both are present
    if STACK_NAME_KEY in configurations and STACK_ROOT_KEY in configurations:
        stack_root = stack_tools.get_stack_root(
            configurations[STACK_NAME_KEY], configurations[STACK_ROOT_KEY]).lower()
    else:
        stack_root = STACK_ROOT_DEFAULT

    # oozie configuration directory under the stack root, with a legacy
    # fallback when that path does not exist
    oozie_config_directory = OOZIE_CONF_DIR.replace(STACK_ROOT_PATTERN, stack_root)
    if not os.path.exists(oozie_config_directory):
        oozie_config_directory = OOZIE_CONF_DIR_LEGACY

    command = "source {0}/oozie-env.sh ; oozie admin -oozie {1} -status".format(
        oozie_config_directory, oozie_url)

    return (command, kerberos_env, user)
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Reads the UpgradeFinalized attribute of the NameNodeInfo JMX bean of the
    NameNode on this host and reports OK when it is truthy, CRITICAL when it
    is not, UNKNOWN (with the traceback as label) when the query fails, and
    SKIPPED when hdfs-site is missing or no NameNode address matches the host.

    Keyword arguments:
    configurations : a mapping of configuration key to value
    parameters : a mapping of script parameter key to value
    host_name : the name of this host where the alert is running

    :type configurations dict
    :type parameters dict
    :type host_name str
    """
    if configurations is None:
        return ('UNKNOWN', ['There were no configurations supplied to the script.'])

    # hdfs-site is required
    if HDFS_SITE_KEY not in configurations:
        return 'SKIPPED', ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)]

    http_policy = 'HTTP_ONLY'
    if NN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[NN_HTTP_POLICY_KEY]

    # left unbound when not configured; only read on the Kerberos path below
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
        executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL].replace('_HOST', host_name)

    kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)

    # pick the scheme, then the NameNode address that belongs to this host
    hdfs_site = configurations[HDFS_SITE_KEY]

    if http_policy == "HTTPS_ONLY":
        scheme = "https"
    else:
        scheme = "http"

    nn_addresses = get_all_namenode_addresses(hdfs_site)
    uri = next(
        (address for address in nn_addresses
         if address.startswith(host_name + ":") or address == host_name),
        None)

    if not uri:
        return 'SKIPPED', [
            'NameNode on host {0} not found (namenode adresses = {1})'.format(
                host_name, ', '.join(nn_addresses))
        ]

    upgrade_finalized_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(
        scheme, uri)

    try:
        use_kerberos = (kerberos_principal is not None
                        and kerberos_keytab is not None
                        and security_enabled)
        if use_kerberos:
            env = Environment.get_instance()

            jmx_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal,
                upgrade_finalized_qry, "upgrade_finalized_state",
                executable_paths, False, "HDFS Upgrade Finalized State",
                smokeuser, kinit_timer_ms=kinit_timer_ms)

            jmx_json = json.loads(jmx_response)
            upgrade_finalized = bool(jmx_json["beans"][0]["UpgradeFinalized"])
        else:
            upgrade_finalized = bool(
                get_value_from_jmx(upgrade_finalized_qry, "UpgradeFinalized"))

        if upgrade_finalized:
            result_code, label = 'OK', "HDFS cluster is not in the upgrade state"
        else:
            result_code, label = 'CRITICAL', "HDFS cluster is not finalized"
    except:
        # any failure while querying is reported as UNKNOWN with its traceback
        result_code, label = 'UNKNOWN', traceback.format_exc()

    return (result_code, [label])
def __init__(self, name):
    """
    Capture the current Environment instance and remember the given name.

    name: stored unchanged on the instance as `self.name`.
    """
    current_env = Environment.get_instance()
    self.env = current_env
    self.name = name
def execute(configurations={}, parameters={}, host_name=None):
    """
    Returns a tuple containing the result code and a pre-formatted result label

    Reads the NameNode's last checkpoint time and journal transaction ids
    over JMX and compares them with the configured checkpoint transaction
    count and checkpoint period.

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running

    Returns:
    ("UNKNOWN", [text]) when configurations is None or any exception is
      raised while querying/evaluating (text is the exception message)
    ("CRITICAL" | "WARNING" | "OK", [label]) otherwise; the non-OK states
      require both the transaction gap to exceed checkpoint_tx and the
      elapsed share of checkpoint_period to reach the respective percentage
    """
    if configurations is None:
        return ("UNKNOWN", ["There were no configurations supplied to the script."])

    uri = None
    scheme = "http"
    http_uri = None
    https_uri = None
    http_policy = "HTTP_ONLY"
    checkpoint_tx = CHECKPOINT_TX_DEFAULT
    checkpoint_period = CHECKPOINT_PERIOD_DEFAULT

    if NN_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NN_HTTP_ADDRESS_KEY]

    if NN_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NN_HTTPS_ADDRESS_KEY]

    if NN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[NN_HTTP_POLICY_KEY]

    if NN_CHECKPOINT_TX_KEY in configurations:
        checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY]

    if NN_CHECKPOINT_PERIOD_KEY in configurations:
        checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]

    # left unbound when not configured; only read on the Kerberos path below,
    # where a missing value surfaces as an UNKNOWN result
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
        executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == "TRUE"

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        # host_name defaults to None and str.replace() rejects a None
        # replacement, so only substitute when a host name was supplied
        if host_name is not None:
            kerberos_principal = kerberos_principal.replace("_HOST", host_name)

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # thresholds arrive as fractions and are compared as percentages
    percent_warning = PERCENT_WARNING_DEFAULT
    if PERCENT_WARNING_KEY in parameters:
        percent_warning = float(parameters[PERCENT_WARNING_KEY]) * 100

    percent_critical = PERCENT_CRITICAL_DEFAULT
    if PERCENT_CRITICAL_KEY in parameters:
        percent_critical = float(parameters[PERCENT_CRITICAL_KEY]) * 100

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == "HTTPS_ONLY":
        scheme = "https"

        if https_uri is not None:
            uri = https_uri

    # milliseconds since the epoch
    current_time = int(round(time.time() * 1000))

    last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(scheme, uri)
    journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme, uri)

    # start out assuming an OK status
    label = None
    result_code = "OK"

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)

            last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir,
                kerberos_keytab,
                kerberos_principal,
                last_checkpoint_time_qry,
                "checkpoint_time_alert",
                executable_paths,
                False,
                "NameNode Last Checkpoint",
                smokeuser,
                connection_timeout=curl_connection_timeout,
            )

            last_checkpoint_time_response_json = json.loads(last_checkpoint_time_response)
            last_checkpoint_time = int(last_checkpoint_time_response_json["beans"][0]["LastCheckpointTime"])

            journal_transaction_info_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir,
                kerberos_keytab,
                kerberos_principal,
                journal_transaction_info_qry,
                "checkpoint_time_alert",
                executable_paths,
                False,
                "NameNode Last Checkpoint",
                smokeuser,
                connection_timeout=curl_connection_timeout,
            )

            journal_transaction_info_response_json = json.loads(journal_transaction_info_response)
            journal_transaction_info = journal_transaction_info_response_json["beans"][0]["JournalTransactionInfo"]
        else:
            last_checkpoint_time = int(
                get_value_from_jmx(last_checkpoint_time_qry, "LastCheckpointTime", connection_timeout)
            )

            journal_transaction_info = get_value_from_jmx(
                journal_transaction_info_qry, "JournalTransactionInfo", connection_timeout
            )

        # the JournalTransactionInfo value is itself a JSON document
        journal_transaction_info_dict = json.loads(journal_transaction_info)

        last_tx = int(journal_transaction_info_dict["LastAppliedOrWrittenTxId"])
        most_recent_tx = int(journal_transaction_info_dict["MostRecentCheckpointTxId"])
        transaction_difference = last_tx - most_recent_tx

        # whole seconds since the last checkpoint; both operands are ints, so
        # `//` matches what `/` did on Python 2 and stays integral on Python 3
        delta = (current_time - last_checkpoint_time) // 1000

        elapsed = get_time(delta)
        label = LABEL.format(h=elapsed["h"], m=elapsed["m"], tx=transaction_difference)

        if (transaction_difference > int(checkpoint_tx)) and (
            float(delta) / int(checkpoint_period) * 100 >= int(percent_critical)
        ):
            result_code = "CRITICAL"
        elif (transaction_difference > int(checkpoint_tx)) and (
            float(delta) / int(checkpoint_period) * 100 >= int(percent_warning)
        ):
            result_code = "WARNING"

    # `as` form works on Python 2.6+ and 3, unlike `except X, name`
    except Exception as e:
        label = str(e)
        result_code = "UNKNOWN"

    # hand the computed state back to the caller, as the docstring promises
    return (result_code, [label])
def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: configurations (dictionary): a mapping of configuration key to value parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ if configurations is None: return (('UNKNOWN', ['There were no configurations supplied to the script.'])) scheme = 'http' http_uri = None https_uri = None http_policy = 'HTTP_ONLY' security_enabled = False if SECURITY_ENABLED_KEY in configurations: security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE' executable_paths = None if EXECUTABLE_SEARCH_PATHS in configurations: executable_paths = configurations[EXECUTABLE_SEARCH_PATHS] kerberos_keytab = None if KERBEROS_KEYTAB in configurations: kerberos_keytab = configurations[KERBEROS_KEYTAB] kerberos_principal = None if KERBEROS_PRINCIPAL in configurations: kerberos_principal = configurations[KERBEROS_PRINCIPAL] kerberos_principal = kerberos_principal.replace('_HOST', host_name) if NODEMANAGER_HTTP_ADDRESS_KEY in configurations: http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY] if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations: https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY] if YARN_HTTP_POLICY_KEY in configurations: http_policy = configurations[YARN_HTTP_POLICY_KEY] if SMOKEUSER_KEY in configurations: smokeuser = configurations[SMOKEUSER_KEY] # parse script arguments connection_timeout = CONNECTION_TIMEOUT_DEFAULT if CONNECTION_TIMEOUT_KEY in parameters: connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY]) # determine the right URI and whether to use SSL uri = http_uri if http_policy == 'HTTPS_ONLY': scheme = 'https' if https_uri is not None: uri = https_uri uri = str(host_name) + ":" + uri.split(":")[1] live_nodemanagers_qry = 
"{0}://{1}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo".format(scheme, uri) convert_to_json_failed = False response_code = None try: if kerberos_principal is not None and kerberos_keytab is not None and security_enabled: env = Environment.get_instance() # curl requires an integer timeout curl_connection_timeout = int(connection_timeout) url_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal, live_nodemanagers_qry, "nm_health_summary_alert", executable_paths, False, "NodeManager Health Summary", smokeuser, connection_timeout=curl_connection_timeout) try: url_response_json = json.loads(url_response) live_nodemanagers = json.loads(url_response_json["beans"][0]["LiveNodeManagers"]) except ValueError, error: convert_to_json_failed = True logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}". format("NodeManager Health Summary", str(error))) if convert_to_json_failed: response_code, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal, live_nodemanagers_qry, "nm_health_summary_alert", executable_paths, True, "NodeManager Health Summary", smokeuser, connection_timeout=curl_connection_timeout) else:
def execute(configurations={}, parameters={}, host_name=None):
    """
    Query the NodeManager ``/ws/v1/node/info`` REST endpoint of this host.

    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running;
      when None the local FQDN is used

    The ``{}`` defaults are kept for interface compatibility; they are only
    read, never mutated.

    NOTE(review): the body visible here stops after the HTTP error handler, so
    nothing is returned once the response has been parsed -- confirm whether
    the evaluation of ``json_response`` was lost from this file.
    """
    result_code = RESULT_CODE_UNKNOWN

    if configurations is None:
        return (result_code, ['There were no configurations supplied to the script.'])

    # Resolve the host up front: it is needed for the _HOST substitution in the
    # kerberos principal as well as for the fallback web address further down.
    if host_name is None:
        host_name = socket.getfqdn()

    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'

    # default to None so the kerberized request does not hit an unbound name
    # when no smoke user is configured
    smokeuser = None
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
        executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    if NODEMANAGER_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY]

    if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY]

    if YARN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[YARN_HTTP_POLICY_KEY]

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'

        if https_uri is not None:
            uri = https_uri

    label = ''
    url_response = None
    node_healthy = 'false'
    total_time = 0

    # some yarn-site structures don't have the web ui address
    if uri is None:
        uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)

    if OSCheck.is_windows_family():
        uri_host, uri_port = uri.split(':')
        # on windows 0.0.0.0 is invalid address to connect but on linux it resolved to 127.0.0.1
        uri_host = resolve_address(uri_host)
        uri = '{0}:{1}'.format(uri_host, uri_port)

    query = "{0}://{1}/ws/v1/node/info".format(scheme, uri)

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            # curl requires an integer timeout
            curl_connection_timeout = int(connection_timeout)

            url_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal, query,
                "nm_health_alert", executable_paths, False, "NodeManager Health",
                smokeuser, connection_timeout=curl_connection_timeout)

            json_response = json.loads(url_response)
        else:
            # execute the query for the JSON that includes templeton status
            url_response = urllib2.urlopen(query, timeout=connection_timeout)
            json_response = json.loads(url_response.read())
    except urllib2.HTTPError as httpError:
        label = CRITICAL_HTTP_STATUS_MESSAGE.format(
            str(httpError.code), query, str(httpError), traceback.format_exc())

        return (RESULT_CODE_CRITICAL, [label])
def get_check_command(oozie_url, host_name, configurations, parameters):
    """
    Build the shell command used to ask Oozie for its status.

    When security is enabled a credentials cache is prepared first: the cache
    is tested with klist and, if that test fails, a kinit is executed as the
    smoke user.

    oozie_url      : URL handed to ``oozie admin -oozie``
    host_name      : host the alert runs on; substituted (lower-cased) for
                     ``_HOST`` in the smoke user principal when not None
    configurations : a mapping of configuration key to value
    parameters     : a mapping of script parameter key to value

    Returns a tuple ``(command, kerberos_env, smokeuser)``; ``kerberos_env`` is
    None unless security is enabled.
    """
    kerberos_env = None

    smokeuser = SMOKEUSER_DEFAULT
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    security_enabled = False
    if SECURITY_ENABLED in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED]).upper() == 'TRUE'

    if security_enabled:
        # defaults
        smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT
        smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT

        # check script params
        if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
            smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY]

        if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
            smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY]

        # check configurations last as they should always take precedence
        if SMOKEUSER_PRINCIPAL_KEY in configurations:
            smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]

        if SMOKEUSER_KEYTAB_KEY in configurations:
            smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY]

        # A principal that still contains the literal _HOST placeholder cannot
        # be used with kinit, so substitute the actual host name.
        if host_name is not None:
            smokeuser_principal = smokeuser_principal.replace('_HOST', host_name.lower())

        # Create the kerberos credentials cache (ccache) file and set it in the environment to use
        # when executing curl
        env = Environment.get_instance()
        ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, os.sep, os.getpid())
        kerberos_env = {'KRB5CCNAME': ccache_file}

        # Get the configured Kerberos executable search paths, if any
        kerberos_executable_search_paths = None
        if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
            kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]

        # NOTE: format() fills the {placeholders} from the local variables of
        # this function, so the local names below must not be renamed.
        klist_path_local = get_klist_path(kerberos_executable_search_paths)
        klist_command = format("{klist_path_local} -s {ccache_file}")

        # Determine if we need to kinit by testing to see if the relevant cache exists and has
        # non-expired tickets. Tickets are marked to expire after 5 minutes to help reduce the
        # number of kinits we do but recover quickly when keytabs are regenerated
        return_code, _ = call(klist_command, user=smokeuser)
        if return_code != 0:
            kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
            kinit_command = format("{kinit_path_local} -l 5m -kt {smokeuser_keytab} {smokeuser_principal}; ")

            # kinit
            Execute(kinit_command, environment=kerberos_env, user=smokeuser)

    # oozie configuration directory uses a symlink when > HDP 2.2
    oozie_config_directory = OOZIE_CONF_DIR_LEGACY
    if os.path.exists(OOZIE_CONF_DIR):
        oozie_config_directory = OOZIE_CONF_DIR

    command = "source {0}/oozie-env.sh ; oozie admin -oozie {1} -status".format(
        oozie_config_directory, oozie_url)

    return (command, kerberos_env, smokeuser)
def execute(configurations={}, parameters={}, host_name=None):
    """
    Report whether HDFS is finalized, based on the ``UpgradeFinalized`` value
    of the NameNode's NameNodeInfo JMX bean.

    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations : a mapping of configuration key to value
    parameters : a mapping of script parameter key to value
    host_name : the name of this host where the alert is running

    :type configurations dict
    :type parameters dict
    :type host_name str

    The ``{}`` defaults are kept for interface compatibility; they are only
    read, never mutated. Any error raised while querying is reported as
    UNKNOWN with the traceback as label.
    """
    if configurations is None:
        return ('UNKNOWN', ['There were no configurations supplied to the script.'])

    scheme = 'http'
    http_uri = configurations.get(NN_HTTP_ADDRESS_KEY)
    https_uri = configurations.get(NN_HTTPS_ADDRESS_KEY)
    http_policy = configurations.get(NN_HTTP_POLICY_KEY, 'HTTP_ONLY')

    # None (instead of an unbound name) when no smoke user is configured
    smokeuser = configurations.get(SMOKEUSER_KEY)
    executable_paths = configurations.get(EXECUTABLE_SEARCH_PATHS)

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = configurations.get(KERBEROS_KEYTAB)
    kerberos_principal = configurations.get(KERBEROS_PRINCIPAL)
    # host_name defaults to None; str.replace() would raise on it before the
    # try block below could turn the failure into an UNKNOWN result
    if kerberos_principal is not None and host_name is not None:
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'

        if https_uri is not None:
            uri = https_uri

    upgrade_finalized_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme, uri)

    # start out assuming an OK status
    label = None
    result_code = "OK"

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            upgrade_finalized_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal, upgrade_finalized_qry,
                "upgrade_finalized_state", executable_paths, False,
                "HDFS Upgrade Finalized State", smokeuser)

            upgrade_finalized_response_json = json.loads(upgrade_finalized_response)
            upgrade_finalized = bool(upgrade_finalized_response_json["beans"][0]["UpgradeFinalized"])
        else:
            upgrade_finalized = bool(get_value_from_jmx(upgrade_finalized_qry, "UpgradeFinalized"))

        if upgrade_finalized:
            label = "HDFS cluster is not in the upgrade state"
            result_code = 'OK'
        else:
            label = "HDFS cluster is not finalized"
            result_code = 'CRITICAL'
    except Exception:
        # Exception rather than a bare except, so that SystemExit and
        # KeyboardInterrupt are not swallowed
        label = traceback.format_exc()
        result_code = 'UNKNOWN'

    return (result_code, [label])
def execute(configurations={}, parameters={}, host_name=None):
    """
    Report whether HDFS is finalized, based on the ``UpgradeFinalized`` value
    of the NameNode's NameNodeInfo JMX bean.

    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations : a mapping of configuration key to value
    parameters : a mapping of script parameter key to value
    host_name : the name of this host where the alert is running

    :type configurations dict
    :type parameters dict
    :type host_name str

    The ``{}`` defaults are kept for interface compatibility; they are only
    read, never mutated. Any error raised while querying is reported as
    UNKNOWN with the error text as label.
    """
    if configurations is None:
        return ('UNKNOWN', ['There were no configurations supplied to the script.'])

    scheme = 'http'
    http_uri = None
    https_uri = None
    http_policy = 'HTTP_ONLY'

    if NN_HTTP_ADDRESS_KEY in configurations:
        http_uri = configurations[NN_HTTP_ADDRESS_KEY]

    if NN_HTTPS_ADDRESS_KEY in configurations:
        https_uri = configurations[NN_HTTPS_ADDRESS_KEY]

    if NN_HTTP_POLICY_KEY in configurations:
        http_policy = configurations[NN_HTTP_POLICY_KEY]

    # bind the name even when no smoke user is configured
    smokeuser = None
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    executable_paths = None
    if EXECUTABLE_SEARCH_PATHS in configurations:
        executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        # host_name defaults to None, which str.replace() rejects
        if host_name is not None:
            kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # determine the right URI and whether to use SSL
    uri = http_uri
    if http_policy == 'HTTPS_ONLY':
        scheme = 'https'

        if https_uri is not None:
            uri = https_uri

    upgrade_finalized_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(
        scheme, uri)

    # start out assuming an OK status
    label = None
    result_code = "OK"

    try:
        if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
            env = Environment.get_instance()

            upgrade_finalized_response, error_msg, time_millis = curl_krb_request(
                env.tmp_dir, kerberos_keytab, kerberos_principal, upgrade_finalized_qry,
                "upgrade_finalized_state", executable_paths, False,
                "HDFS Upgrade Finalized State", smokeuser)

            upgrade_finalized_response_json = json.loads(upgrade_finalized_response)
            upgrade_finalized = bool(
                upgrade_finalized_response_json["beans"][0]["UpgradeFinalized"])
        else:
            upgrade_finalized = bool(
                get_value_from_jmx(upgrade_finalized_qry, "UpgradeFinalized"))

        if upgrade_finalized:
            label = "HDFS cluster is not in the upgrade state"
            result_code = 'OK'
        else:
            label = "HDFS cluster is not finalized"
            result_code = 'CRITICAL'
    except Exception as e:
        label = str(e)
        result_code = 'UNKNOWN'

    # hand the computed state back to the caller, as the docstring promises
    return (result_code, [label])
def get_check_command(oozie_url, host_name, configurations, parameters, only_kinit):
    """
    Build the shell command that asks Oozie for its status.

    With security enabled, a kinit command is executed first as the configured
    user. Unless ``only_kinit`` is set, that command is prefixed with a
    ``klist -s`` test of the credentials cache so the kinit only runs when the
    test fails.

    Returns a tuple ``(command, kerberos_env, user)``; ``kerberos_env`` stays
    None when security is not enabled.
    """
    user = configurations[USER_KEY] if USER_KEY in configurations else USER_DEFAULT
    kerberos_env = None

    if is_security_enabled(configurations):
        # start from the defaults
        user_principal = USER_PRINCIPAL_DEFAULT
        user_keytab = USER_KEYTAB_DEFAULT

        # Script parameters are applied first and configurations second, so a
        # value found in the configurations always takes precedence.
        lookup_order = (
            (parameters, USER_PRINCIPAL_SCRIPT_PARAM_KEY, USER_KEYTAB_SCRIPT_PARAM_KEY),
            (configurations, USER_PRINCIPAL_KEY, USER_KEYTAB_KEY),
        )
        for mapping, principal_key, keytab_key in lookup_order:
            if principal_key in mapping:
                user_principal = mapping[principal_key].replace('_HOST', host_name.lower())
            if keytab_key in mapping:
                user_keytab = mapping[keytab_key]

        # The credentials cache (ccache) file is handed to the executed
        # commands through the KRB5CCNAME environment variable.
        env = Environment.get_instance()
        ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, os.sep, os.getpid())
        kerberos_env = {'KRB5CCNAME': ccache_file}

        # configured Kerberos executable search paths, if any
        if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
            kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
        else:
            kerberos_executable_search_paths = None

        # NOTE: format() fills the {placeholders} from the local variables of
        # this function, so those local names must stay as they are.
        klist_path_local = get_klist_path(kerberos_executable_search_paths)
        kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
        kinit_part_command = format("{kinit_path_local} -l 5m20s -c {ccache_file} -kt {user_keytab} {user_principal}; ")

        # Short ticket lifetimes reduce the number of kinits while still
        # recovering quickly when keytabs are regenerated. The klist test
        # skips the kinit while the cache still holds non-expired tickets.
        if not only_kinit:
            kinit_command = "{0} -s {1} || ".format(klist_path_local, ccache_file) + kinit_part_command
        else:
            kinit_command = kinit_part_command

        Execute(kinit_command, environment=kerberos_env, user=user)

    # oozie configuration directory uses a symlink when > HDP 2.2
    if os.path.exists(OOZIE_CONF_DIR):
        oozie_config_directory = OOZIE_CONF_DIR
    else:
        oozie_config_directory = OOZIE_CONF_DIR_LEGACY

    command = "source {0}/oozie-env.sh ; oozie admin -oozie {1} -status".format(
        oozie_config_directory, oozie_url)

    return (command, kerberos_env, user)
def execute(configurations={}, parameters={}, host_name=None):
    """
    Check the WebHCat (templeton) status endpoint on this host.

    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running;
      when None the local FQDN is used

    The ``{}`` defaults are kept for interface compatibility; they are only
    read, never mutated.

    NOTE(review): the body visible here ends after the secured branch; the
    non-secured request and the evaluation of ``json_response`` are not part
    of this file, so nothing is returned on the success path -- confirm.
    """
    result_code = RESULT_CODE_UNKNOWN

    if configurations is None:
        return (result_code, ['There were no configurations supplied to the script.'])

    webhcat_port = WEBHCAT_PORT_DEFAULT
    if TEMPLETON_PORT_KEY in configurations:
        webhcat_port = int(configurations[TEMPLETON_PORT_KEY])

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        # str() so that a boolean configuration value does not raise on .lower()
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).lower() == 'true'

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    curl_connection_timeout = CURL_CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
        # curl is given the timeout as a whole number of seconds, as text
        curl_connection_timeout = str(int(connection_timeout))

    # the alert will always run on the webhcat host
    if host_name is None:
        host_name = socket.getfqdn()

    # webhcat always uses http, never SSL
    query_url = "http://{0}:{1}/templeton/v1/status".format(host_name, webhcat_port)

    # initialize
    total_time = 0
    json_response = {}

    if security_enabled:
        if WEBHCAT_KEYTAB_KEY not in configurations or WEBHCAT_PRINCIPAL_KEY not in configurations:
            return (RESULT_CODE_UNKNOWN, [str(configurations)])

        try:
            webhcat_keytab = configurations[WEBHCAT_KEYTAB_KEY]
            webhcat_principal = configurations[WEBHCAT_PRINCIPAL_KEY]

            # substitute _HOST in kerberos principal with actual fqdn
            webhcat_principal = webhcat_principal.replace('_HOST', host_name)

            # Create the kerberos credentials cache (ccache) file and set it in the environment to use
            # when executing curl
            env = Environment.get_instance()
            ccache_file = "{0}{1}webhcat_alert_cc_{2}".format(env.tmp_dir, sep, getpid())
            kerberos_env = {'KRB5CCNAME': ccache_file}

            # Get the configured Kerberos executable search paths, if any
            if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
                kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
            else:
                kerberos_executable_search_paths = None

            # NOTE: format() fills the {placeholders} from the local variables
            # of this function, so those local names must stay as they are.
            klist_path_local = get_klist_path(kerberos_executable_search_paths)
            klist_command = format("{klist_path_local} -s {ccache_file}")

            # Determine if we need to kinit by testing to see if the relevant cache exists and has
            # non-expired tickets. Tickets are marked to expire after 5 minutes to help reduce the
            # number of kinits we do but recover quickly when keytabs are regenerated
            return_code, _ = call(klist_command)
            if return_code != 0:
                kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
                kinit_command = format("{kinit_path_local} -l 5m -c {ccache_file} -kt {webhcat_keytab} {webhcat_principal}; ")

                # kinit so that curl will work with --negotiate
                Execute(kinit_command)

            # make a single curl call to get just the http code
            curl = subprocess.Popen(
                ['curl', '--negotiate', '-u', ':', '-sL', '-w', '%{http_code}',
                 '--connect-timeout', curl_connection_timeout,
                 '-o', '/dev/null', query_url],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=kerberos_env)

            stdout, stderr = curl.communicate()

            if stderr != '':
                raise Exception(stderr)

            # check the response code
            response_code = int(stdout)

            # 0 indicates no connection
            if response_code == 0:
                label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
                return (RESULT_CODE_CRITICAL, [label])

            # any other response aside from 200 is a problem
            if response_code != 200:
                label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url)
                return (RESULT_CODE_CRITICAL, [label])

            # now that we have the http status and it was 200, get the content
            start_time = time.time()
            curl = subprocess.Popen(
                ['curl', '--negotiate', '-u', ':', '-sL',
                 '--connect-timeout', curl_connection_timeout, query_url],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=kerberos_env)

            stdout, stderr = curl.communicate()
            total_time = time.time() - start_time

            if stderr != '':
                raise Exception(stderr)

            json_response = json.loads(stdout)
        except Exception as exception:
            return (RESULT_CODE_CRITICAL, [str(exception)])
def execute(configurations={}, parameters={}, host_name=None):
    """
    Report the health of the NameNode HA topology.

    Every NameNode listed for the nameservice is queried over JMX and sorted
    into active, standby or unknown. The topology counts as healthy only when
    there is exactly one active and one standby NameNode.

    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running;
      when None the local FQDN is used

    The ``{}`` defaults are kept for interface compatibility; they are only
    read, never mutated.
    """
    if configurations is None:
        return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])

    # if not in HA mode, then SKIP
    if NAMESERVICE_KEY not in configurations:
        return (RESULT_STATE_SKIPPED, ['NameNode HA is not enabled'])

    # hdfs-site is required
    if HDFS_SITE_KEY not in configurations:
        return (RESULT_STATE_UNKNOWN,
                ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])

    # host_name is used for the _HOST substitution and to decide which host
    # reports the alert; str.replace()/str.startswith() both reject None
    if host_name is None:
        import socket
        host_name = socket.getfqdn()

    # Bind the name even when no smoke user is configured. Left unbound, the
    # resulting NameError was swallowed by the per-NameNode handler below and
    # every NameNode silently ended up in the "unknown" list.
    smokeuser = None
    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    # parse script arguments
    connection_timeout = CONNECTION_TIMEOUT_DEFAULT
    if CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    kerberos_keytab = None
    if KERBEROS_KEYTAB in configurations:
        kerberos_keytab = configurations[KERBEROS_KEYTAB]

    kerberos_principal = None
    if KERBEROS_PRINCIPAL in configurations:
        kerberos_principal = configurations[KERBEROS_PRINCIPAL]
        kerberos_principal = kerberos_principal.replace('_HOST', host_name)

    # determine whether or not SSL is enabled
    is_ssl_enabled = False
    if DFS_POLICY_KEY in configurations:
        dfs_policy = configurations[DFS_POLICY_KEY]
        if dfs_policy == "HTTPS_ONLY":
            is_ssl_enabled = True

    name_service = configurations[NAMESERVICE_KEY]
    hdfs_site = configurations[HDFS_SITE_KEY]

    # look for dfs.ha.namenodes.foo
    nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
    if nn_unique_ids_key not in hdfs_site:
        return (RESULT_STATE_UNKNOWN,
                ['Unable to find unique namenode alias key {0}'.format(nn_unique_ids_key)])

    namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
    jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    if is_ssl_enabled:
        namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
        jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"

    active_namenodes = []
    standby_namenodes = []
    unknown_namenodes = []

    # now we have something like 'nn1,nn2,nn3,nn4'
    # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
    # ie dfs.namenode.http-address.hacluster.nn1
    nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
    for nn_unique_id in nn_unique_ids:
        key = namenode_http_fragment.format(name_service, nn_unique_id)

        if key not in hdfs_site:
            continue

        # use str() to ensure that unicode strings do not have the u' in them
        value = str(hdfs_site[key])

        try:
            jmx_uri = jmx_uri_fragment.format(value)
            if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
                env = Environment.get_instance()

                # curl requires an integer timeout
                curl_connection_timeout = int(connection_timeout)

                state_response, error_msg, time_millis = curl_krb_request(
                    env.tmp_dir, kerberos_keytab, kerberos_principal, jmx_uri,
                    "ha_nn_health", None, False, "NameNode High Availability Health",
                    smokeuser, connection_timeout=curl_connection_timeout)

                state = _get_ha_state_from_json(state_response)
            else:
                state_response = get_jmx(jmx_uri, connection_timeout)
                state = _get_ha_state_from_json(state_response)

            if state == HDFS_NN_STATE_ACTIVE:
                active_namenodes.append(value)
            elif state == HDFS_NN_STATE_STANDBY:
                standby_namenodes.append(value)
            else:
                unknown_namenodes.append(value)
        except Exception:
            # a NameNode that cannot be queried is deliberately counted as
            # unknown; Exception (not a bare except) keeps SystemExit and
            # KeyboardInterrupt propagating
            unknown_namenodes.append(value)

    # now that the request is done, determine if this host is the host that
    # should report the status of the HA topology
    is_active_namenode = any(
        active_namenode.startswith(host_name) for active_namenode in active_namenodes)

    # there's only one scenario here; there is exactly 1 active and 1 standby
    is_topology_healthy = len(active_namenodes) == 1 and len(standby_namenodes) == 1

    result_label = 'Active{0}, Standby{1}, Unknown{2}'.format(
        str(active_namenodes), str(standby_namenodes), str(unknown_namenodes))

    # Healthy Topology:
    #   - Active NN reports the alert, standby does not
    #
    # Unhealthy Topology:
    #   - Report the alert if this is the first named host
    #   - Report the alert if not the first named host, but the other host
    #   could not report its status
    if is_topology_healthy:
        if is_active_namenode:
            return (RESULT_STATE_OK, [result_label])
        return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])

    # dfs.namenode.rpc-address.service.alias is guaranteed in HA mode
    first_listed_host_key = 'dfs.namenode.rpc-address.{0}.{1}'.format(
        name_service, nn_unique_ids[0])

    first_listed_host = ''
    if first_listed_host_key in hdfs_site:
        first_listed_host = hdfs_site[first_listed_host_key]

    if first_listed_host.startswith(host_name):
        return (RESULT_STATE_CRITICAL, [result_label])

    # not the first listed host, but the first host might be in the unknown
    return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
def execute(configs={}, parameters={}, host_name=None):
    """
    Check that every Solr shard and replica is in the ``active`` state.

    The cluster status is downloaded with curl into a temporary state file,
    parsed as JSON and scanned for shards/replicas whose state is not
    ``active``.

    Keyword arguments:
    configs (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running;
      when None the local FQDN is used

    Returns a tuple containing the result code and a list with one result
    label. The ``{}`` defaults are kept for interface compatibility; they are
    only read, never mutated.
    """
    if configs is None:
        return 'UNKNOWN', ['There were no configurations supplied to the script.']

    if host_name is None:
        host_name = socket.getfqdn()

    env = Environment.get_instance()

    solr_user = configs[SMOKEUSER_KEY]

    ui_ssl_enabled = False
    ui_ssl_enabled_key = UI_SSL_ENABLED_KEY_DEFAULT

    security_enabled = False
    if SECURITY_ENABLED_KEY in configs:
        security_enabled = str(configs[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    # check parameters
    if UI_SSL_ENABLED_KEY in parameters:
        ui_ssl_enabled_key = parameters[UI_SSL_ENABLED_KEY]

    if ui_ssl_enabled_key in configs:
        # read the value under the key that was just tested for, not under
        # the name of the script parameter
        ui_ssl_enabled = str(configs[ui_ssl_enabled_key]).upper() == 'TRUE'

    solr_port = SOLR_PORT_DEFAULT
    if SOLR_PORT_KEY in parameters:
        solr_port = parameters[SOLR_PORT_KEY]

    connection_timeout = SOLR_CONNECTION_TIMEOUT_DEFAULT
    if SOLR_CONNECTION_TIMEOUT_KEY in parameters:
        connection_timeout = parameters[SOLR_CONNECTION_TIMEOUT_KEY]

    if security_enabled:
        try:
            security_auth(configs, host_name, solr_user)
        except Exception as e:
            return RESULT_CODE_CRITICAL, ["kinit error: " + str(e)]

    scheme = "https" if ui_ssl_enabled else "http"

    state_file = "{}/solrstatus.json".format(env.tmp_dir)
    cmd = "curl -s -m {} -o {} --negotiate -u: -k '{}://{}:{}/solr/admin/collections?action=clusterstatus&wt=json'".format(
        connection_timeout, state_file, scheme, host_name, solr_port)

    try:
        Execute(cmd, tries=2, try_sleep=3, user=solr_user, logoutput=True)
    except Exception:
        return (RESULT_CODE_CRITICAL, ["curl cannot reach Solr, solr seems to be down"])

    try:
        # the with-block closes the handle even when parsing fails
        with open(state_file) as state_handle:
            state = json.load(state_handle)
    except Exception:
        return (RESULT_CODE_CRITICAL, ["Get status failed, could not load state file"])
    finally:
        # remove the file on both paths so an unparsable download is not
        # left behind in the temporary directory
        if os.path.exists(state_file):
            os.remove(state_file)

    cluster_state = state['cluster']['collections']

    outdata = {'shards': [], 'replicas': []}

    for key in cluster_state:
        for shard, shard_data in cluster_state[key]['shards'].items():
            for replica, replica_data in shard_data['replicas'].items():
                if replica_data['state'] != 'active':
                    rname = '-'.join([key, shard, replica])
                    outdata['replicas'].append({rname: replica_data['state']})

            if shard_data['state'] != 'active':
                sname = '-'.join([key, shard])
                outdata['shards'].append({sname: shard_data['state']})

    if outdata['shards'] or outdata['replicas']:
        return (RESULT_CODE_CRITICAL,
                ["Replicas or Shards found not active. %s" % json.dumps(outdata)])

    return (RESULT_CODE_OK, ["All Shards and replicas healthy"])
def _replace_url_host(url, new_host):
    """Return url with only its host component replaced by new_host."""
    parsed = urlparse(url)
    old_host = parsed.hostname
    if not old_host:
        # no host could be parsed out of the URL; leave it untouched
        return url

    netloc = parsed.netloc
    # urlparse reports the host name lower-cased, so look for it
    # case-insensitively and only after any "user:password@" prefix
    host_at = netloc.lower().find(old_host, netloc.rfind('@') + 1)
    if host_at < 0:
        return url

    # the network location starts right after the first "//"
    start = url.find('//') + 2 + host_at
    return url[:start] + new_host + url[start + len(old_host):]


def execute(parameters=None, host_name=None):
    """
    Run ``oozie admin -status`` against the local Oozie server.

    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    parameters (dictionary): a mapping of parameter key to value
    host_name (string): the name of this host where the alert is running

    When security is enabled a Kerberos ticket is obtained first if the
    credentials cache does not pass the klist test. Any exception raised while
    running the commands is reported as CRITICAL with the error text as label.
    """
    if parameters is None:
        return (RESULT_CODE_UNKNOWN, ['There were no parameters supplied to the script.'])

    if OOZIE_URL_KEY not in parameters:
        return (RESULT_CODE_UNKNOWN, ['The Oozie URL is a required parameter.'])

    # use localhost on Windows, 0.0.0.0 on others; 0.0.0.0 means bind to all
    # interfaces, which doesn't work on Windows
    localhost_address = 'localhost' if OSCheck.get_os_family() == OSConst.WINSRV_FAMILY else '0.0.0.0'

    # Swap only the host part of the URL. A plain str.replace() would also
    # rewrite the same text elsewhere in the URL (for example a host called
    # "oozie" in ".../oozie") and would miss a mixed-case host name.
    oozie_url = _replace_url_host(parameters[OOZIE_URL_KEY], localhost_address)

    security_enabled = False
    if SECURITY_ENABLED in parameters:
        security_enabled = str(parameters[SECURITY_ENABLED]).upper() == 'TRUE'

    # NOTE: format() fills the {placeholders} from the local variables of this
    # function, so the local names used in the templates must not be renamed.
    command = format("source /etc/oozie/conf/oozie-env.sh ; oozie admin -oozie {oozie_url} -status")

    try:
        # kinit if security is enabled so that oozie-env.sh can make the web request
        kerberos_env = None

        if security_enabled:
            if OOZIE_KEYTAB in parameters and OOZIE_PRINCIPAL in parameters:
                oozie_keytab = parameters[OOZIE_KEYTAB]
                oozie_principal = parameters[OOZIE_PRINCIPAL]

                # substitute _HOST in kerberos principal with actual fqdn
                if host_name is not None:
                    oozie_principal = oozie_principal.replace('_HOST', host_name)
            else:
                return (RESULT_CODE_UNKNOWN,
                        ['The Oozie keytab and principal are required parameters when security is enabled.'])

            # Create the kerberos credentials cache (ccache) file and set it in the environment to use
            # when executing curl
            env = Environment.get_instance()
            ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, sep, getpid())
            kerberos_env = {'KRB5CCNAME': ccache_file}

            klist_path_local = get_klist_path()
            klist_command = format("{klist_path_local} -s {ccache_file}")

            # Determine if we need to kinit by testing to see if the relevant cache exists and has
            # non-expired tickets. Tickets are marked to expire after 5 minutes to help reduce the
            # number of kinits we do but recover quickly when keytabs are regenerated
            return_code, _ = call(klist_command)
            if return_code != 0:
                kinit_path_local = get_kinit_path()
                kinit_command = format("{kinit_path_local} -l 5m -kt {oozie_keytab} {oozie_principal}; ")

                # kinit
                Execute(kinit_command, environment=kerberos_env)

        # execute the command
        Execute(command, environment=kerberos_env)

        return (RESULT_CODE_OK, ["Successful connection to {0}".format(oozie_url)])
    except Exception as ex:
        return (RESULT_CODE_CRITICAL, [str(ex)])
def execute(configurations={}, parameters={}, host_name=None):
  """
  Returns a tuple containing the result code and a pre-formatted result label

  The last checkpoint time and the journal transaction ids are read from the
  NameNode JMX endpoint (through curl_krb_request when a kerberos principal
  and keytab are configured and security is enabled, otherwise through
  get_value_from_jmx) and compared with the configured thresholds.

  Result codes:
  'OK'       - no threshold was reached
  'WARNING'  - the warning transaction or elapsed-time threshold was reached
  'CRITICAL' - the critical transaction or elapsed-time threshold was reached
  'UNKNOWN'  - configurations is None, a required configuration is missing,
               or an exception was raised while collecting or evaluating the
               data (the label then carries the traceback)

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value;
                           None is treated as an empty mapping
  host_name (string): the name of this host where the alert is running
  """

  if configurations is None:
    return (('UNKNOWN', ['There were no configurations supplied to the script.']))

  # The default dictionaries in the signature are only ever read, never
  # mutated. An explicit None is tolerated so that every threshold below
  # simply falls back to its default.
  if parameters is None:
    parameters = {}

  scheme = 'http'
  http_uri = None
  https_uri = None
  http_policy = 'HTTP_ONLY'
  checkpoint_tx = CHECKPOINT_TX_DEFAULT
  checkpoint_period = CHECKPOINT_PERIOD_DEFAULT

  if NN_HTTP_ADDRESS_KEY in configurations:
    http_uri = configurations[NN_HTTP_ADDRESS_KEY]

  if NN_HTTPS_ADDRESS_KEY in configurations:
    https_uri = configurations[NN_HTTPS_ADDRESS_KEY]

  if NN_HTTP_POLICY_KEY in configurations:
    http_policy = configurations[NN_HTTP_POLICY_KEY]

  if NN_CHECKPOINT_TX_KEY in configurations:
    checkpoint_tx = configurations[NN_CHECKPOINT_TX_KEY]

  if NN_CHECKPOINT_PERIOD_KEY in configurations:
    checkpoint_period = configurations[NN_CHECKPOINT_PERIOD_KEY]

  # always bound, so a missing key can be reported cleanly further down
  smokeuser = None
  if SMOKEUSER_KEY in configurations:
    smokeuser = configurations[SMOKEUSER_KEY]

  executable_paths = None
  if EXECUTABLE_SEARCH_PATHS in configurations:
    executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]

  security_enabled = False
  if SECURITY_ENABLED_KEY in configurations:
    security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

  kerberos_keytab = None
  if KERBEROS_KEYTAB in configurations:
    kerberos_keytab = configurations[KERBEROS_KEYTAB]

  kerberos_principal = None
  if KERBEROS_PRINCIPAL in configurations:
    kerberos_principal = configurations[KERBEROS_PRINCIPAL]
    # host_name defaults to None; str.replace() would raise a TypeError here,
    # outside of the try block below, so only substitute a real host name
    if host_name is not None:
      kerberos_principal = kerberos_principal.replace('_HOST', host_name)

  # parse script arguments
  connection_timeout = CONNECTION_TIMEOUT_DEFAULT
  if CONNECTION_TIMEOUT_KEY in parameters:
    connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])

  percent_warning = PERCENT_WARNING_DEFAULT
  if PERCENT_WARNING_KEY in parameters:
    percent_warning = float(parameters[PERCENT_WARNING_KEY])

  percent_critical = PERCENT_CRITICAL_DEFAULT
  if PERCENT_CRITICAL_KEY in parameters:
    percent_critical = float(parameters[PERCENT_CRITICAL_KEY])

  checkpoint_txn_multiplier_warning = CHECKPOINT_TX_MULTIPLIER_WARNING_DEFAULT
  if CHECKPOINT_TX_MULTIPLIER_WARNING_KEY in parameters:
    checkpoint_txn_multiplier_warning = float(parameters[CHECKPOINT_TX_MULTIPLIER_WARNING_KEY])

  checkpoint_txn_multiplier_critical = CHECKPOINT_TX_MULTIPLIER_CRITICAL_DEFAULT
  if CHECKPOINT_TX_MULTIPLIER_CRITICAL_KEY in parameters:
    checkpoint_txn_multiplier_critical = float(parameters[CHECKPOINT_TX_MULTIPLIER_CRITICAL_KEY])

  kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)

  # determine the right URI and whether to use SSL
  uri = http_uri
  if http_policy == 'HTTPS_ONLY':
    scheme = 'https'

    if https_uri is not None:
      uri = https_uri

  # without an address the queries below would target the literal host "None"
  if uri is None:
    return (('UNKNOWN', ['Unable to determine the NameNode address from the supplied configurations.']))

  use_kerberos = kerberos_principal is not None and kerberos_keytab is not None and security_enabled

  # the kerberized request is issued on behalf of the configured user
  if use_kerberos and SMOKEUSER_KEY not in configurations:
    return (('UNKNOWN', ['The smoke user configuration ({0}) is required when security is enabled.'.format(SMOKEUSER_KEY)]))

  current_time = int(round(time.time() * 1000))

  last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(scheme,uri)
  journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme,uri)

  # start out assuming an OK status
  label = None
  result_code = "OK"

  try:
    if use_kerberos:
      env = Environment.get_instance()

      # curl requires an integer timeout
      curl_connection_timeout = int(connection_timeout)

      last_checkpoint_time_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab,
        kerberos_principal, last_checkpoint_time_qry, "checkpoint_time_alert", executable_paths, False,
        "NameNode Last Checkpoint", smokeuser, connection_timeout=curl_connection_timeout,
        kinit_timer_ms = kinit_timer_ms)

      last_checkpoint_time_response_json = json.loads(last_checkpoint_time_response)
      last_checkpoint_time = int(last_checkpoint_time_response_json["beans"][0]["LastCheckpointTime"])

      journal_transaction_info_response, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab,
        kerberos_principal, journal_transaction_info_qry, "checkpoint_time_alert", executable_paths, False,
        "NameNode Last Checkpoint", smokeuser, connection_timeout=curl_connection_timeout,
        kinit_timer_ms = kinit_timer_ms)

      journal_transaction_info_response_json = json.loads(journal_transaction_info_response)
      journal_transaction_info = journal_transaction_info_response_json["beans"][0]["JournalTransactionInfo"]
    else:
      last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry,
        "LastCheckpointTime", connection_timeout))

      journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry,
        "JournalTransactionInfo", connection_timeout)

    # JournalTransactionInfo is itself a JSON document held in a string
    journal_transaction_info_dict = json.loads(journal_transaction_info)

    last_tx = int(journal_transaction_info_dict['LastAppliedOrWrittenTxId'])
    most_recent_tx = int(journal_transaction_info_dict['MostRecentCheckpointTxId'])
    transaction_difference = last_tx - most_recent_tx

    # both timestamps are integers in milliseconds; floor division keeps the
    # elapsed time in whole seconds regardless of the interpreter version
    delta = (current_time - last_checkpoint_time) // 1000

    elapsed = get_time(delta)
    label = LABEL.format(h=elapsed['h'], m=elapsed['m'], tx=transaction_difference)

    is_checkpoint_txn_warning = transaction_difference > checkpoint_txn_multiplier_warning * int(checkpoint_tx)
    is_checkpoint_txn_critical = transaction_difference > checkpoint_txn_multiplier_critical * int(checkpoint_tx)

    # elapsed time since the last checkpoint as a percentage of the period
    # NOTE(review): int() below truncates fractional percent thresholds -
    # confirm that this is intended
    percent_elapsed = float(delta) / int(checkpoint_period)*100

    # Either too many uncommitted transactions or missed check-pointing for
    # long time decided by the thresholds
    if is_checkpoint_txn_critical or (percent_elapsed >= int(percent_critical)):
      logger.debug('Raising critical alert: transaction_difference = {0}, checkpoint_tx = {1}'.format(transaction_difference, checkpoint_tx))
      result_code = 'CRITICAL'
    elif is_checkpoint_txn_warning or (percent_elapsed >= int(percent_warning)):
      logger.debug('Raising warning alert: transaction_difference = {0}, checkpoint_tx = {1}'.format(transaction_difference, checkpoint_tx))
      result_code = 'WARNING'

  except Exception:
    # report any failure as UNKNOWN, but let KeyboardInterrupt and SystemExit
    # propagate instead of being swallowed
    label = traceback.format_exc()
    result_code = 'UNKNOWN'

  return ((result_code, [label]))
def execute(configurations={}, parameters={}, host_name=None): """ Returns a tuple containing the result code and a pre-formatted result label Keyword arguments: configurations (dictionary): a mapping of configuration key to value parameters (dictionary): a mapping of script parameter key to value host_name (string): the name of this host where the alert is running """ if configurations is None: return (('UNKNOWN', ['There were no configurations supplied to the script.'])) scheme = 'http' http_uri = None https_uri = None http_policy = 'HTTP_ONLY' security_enabled = False if SECURITY_ENABLED_KEY in configurations: security_enabled = str( configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE' kerberos_keytab = None if KERBEROS_KEYTAB in configurations: kerberos_keytab = configurations[KERBEROS_KEYTAB] kerberos_principal = None if KERBEROS_PRINCIPAL in configurations: kerberos_principal = configurations[KERBEROS_PRINCIPAL] kerberos_principal = kerberos_principal.replace('_HOST', host_name) if NODEMANAGER_HTTP_ADDRESS_KEY in configurations: http_uri = configurations[NODEMANAGER_HTTP_ADDRESS_KEY] if NODEMANAGER_HTTPS_ADDRESS_KEY in configurations: https_uri = configurations[NODEMANAGER_HTTPS_ADDRESS_KEY] if YARN_HTTP_POLICY_KEY in configurations: http_policy = configurations[YARN_HTTP_POLICY_KEY] if SMOKEUSER_KEY in configurations: smokeuser = configurations[SMOKEUSER_KEY] # parse script arguments connection_timeout = CONNECTION_TIMEOUT_DEFAULT if CONNECTION_TIMEOUT_KEY in parameters: connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY]) # determine the right URI and whether to use SSL uri = http_uri if http_policy == 'HTTPS_ONLY': scheme = 'https' if https_uri is not None: uri = https_uri uri = str(host_name) + ":" + uri.split(":")[1] live_nodemanagers_qry = "{0}://{1}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo".format( scheme, uri) convert_to_json_failed = False response_code = None try: if kerberos_principal is not None and kerberos_keytab is not 
None and security_enabled: env = Environment.get_instance() url_response, error_msg, time_millis = curl_krb_request( env.tmp_dir, kerberos_keytab, kerberos_principal, live_nodemanagers_qry, "nm_health_summary_alert", None, False, "NodeManager Health Summary", smokeuser) try: url_response_json = json.loads(url_response) live_nodemanagers = json.loads( url_response_json["beans"][0]["LiveNodeManagers"]) except ValueError, error: convert_to_json_failed = True if logger.isEnabledFor(logging.DEBUG): logger.exception( "[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}" .format("NodeManager Health Summary", str(error))) if convert_to_json_failed: response_code, error_msg, time_millis = curl_krb_request( env.tmp_dir, kerberos_keytab, kerberos_principal, live_nodemanagers_qry, "nm_health_summary_alert", None, True, "NodeManager Health Summary", smokeuser) else: