def execute(configurations={}, parameters={}, host_name=None):
    """
    Performs advanced disk checks under Linux. This will first attempt to
    check the HDP installation directories if they exist. If they do not exist,
    it will default to checking /

    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    if configurations is None:
        return (('UNKNOWN',
                 ['There were no configurations supplied to the script.']))

    # BUG FIX: this previously returned 'STACK_ROOT' as the result code, which
    # is not a valid alert state; 'UNKNOWN' matches the convention used above.
    if not STACK_NAME in configurations or not STACK_ROOT in configurations:
        return (('UNKNOWN', [
            'cluster-env/stack_name and cluster-env/stack_root are required'
        ]))

    path = stack_tools.get_stack_root(configurations[STACK_NAME],
                                      configurations[STACK_ROOT])

    try:
        disk_usage = _get_disk_usage(path)
        # _get_warnings_for_partition maps the usage numbers to an alert
        # state + label based on the configured warning thresholds.
        result_code, label = _get_warnings_for_partition(parameters, disk_usage)
    # BUG FIX: `except X, e` is Python-2-only syntax; `as` works on 2.6+ and 3.
    except NotImplementedError as platform_error:
        return 'CRITICAL', [str(platform_error)]
    # NOTE(review): as visible here, the success path falls through and
    # returns None — this looks truncated from a longer implementation that
    # returns (result_code, [label]); confirm against the full file.
def get_check_command(oozie_url, host_name, configurations, parameters, only_kinit):
    """
    Builds the shell command used to probe the Oozie server status.

    When security is enabled, this also performs a kinit (guarded by the
    global Kerberos lock) so the returned command can authenticate via the
    ccache exported in the returned environment.

    Returns a (command, kerberos_env, user) tuple, where kerberos_env is
    None on non-secure clusters.
    """
    kerberos_env = None

    # Resolve the user that will run the probe; configuration wins over default.
    user = configurations.get(USER_KEY, USER_DEFAULT)

    if is_security_enabled(configurations):
        # Start from the defaults, then layer script params, then
        # configurations — configurations always take precedence.
        user_keytab = USER_KEYTAB_DEFAULT
        user_principal = USER_PRINCIPAL_DEFAULT

        if USER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
            user_principal = parameters[USER_PRINCIPAL_SCRIPT_PARAM_KEY]
            user_principal = user_principal.replace('_HOST', host_name.lower())

        if USER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
            user_keytab = parameters[USER_KEYTAB_SCRIPT_PARAM_KEY]

        if USER_PRINCIPAL_KEY in configurations:
            user_principal = configurations[USER_PRINCIPAL_KEY]
            user_principal = user_principal.replace('_HOST', host_name.lower())

        if USER_KEYTAB_KEY in configurations:
            user_keytab = configurations[USER_KEYTAB_KEY]

        # Give this alert its own credentials cache (ccache) file, keyed by
        # pid, and expose it through KRB5CCNAME for the curl/oozie commands.
        env = Environment.get_instance()
        ccache_file = "{0}{1}oozie_alert_cc_{2}".format(
            env.tmp_dir, os.sep, os.getpid())
        kerberos_env = {'KRB5CCNAME': ccache_file}

        # Honor any configured Kerberos executable search paths.
        kerberos_executable_search_paths = configurations.get(
            KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY, None)

        klist_path_local = get_klist_path(kerberos_executable_search_paths)
        kinit_path_local = get_kinit_path(kerberos_executable_search_paths)

        kinit_part_command = format(
            "{kinit_path_local} -l 5m20s -c {ccache_file} -kt {user_keytab} {user_principal}; "
        )

        # Tickets are marked to expire after ~5 minutes so that we rarely
        # need to kinit, yet recover quickly when keytabs are regenerated.
        # Unless forced, only kinit when klist shows no valid ticket.
        if only_kinit:
            kinit_command = kinit_part_command
        else:
            kinit_command = "{0} -s {1} || ".format(
                klist_path_local, ccache_file) + kinit_part_command

        # Serialize kinit invocations across concurrent alert runs.
        kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
        kinit_lock.acquire()
        try:
            Execute(kinit_command, environment=kerberos_env, user=user)
        finally:
            kinit_lock.release()

    # Resolve the stack root, falling back to the legacy default.
    stack_root = STACK_ROOT_DEFAULT
    if STACK_NAME_KEY in configurations and STACK_ROOT_KEY in configurations:
        stack_root = stack_tools.get_stack_root(
            configurations[STACK_NAME_KEY],
            configurations[STACK_ROOT_KEY]).lower()

    # Prefer the symlinked, versioned oozie conf dir; fall back to legacy.
    oozie_config_directory = OOZIE_CONF_DIR.replace(STACK_ROOT_PATTERN, stack_root)
    if not os.path.exists(oozie_config_directory):
        oozie_config_directory = OOZIE_CONF_DIR_LEGACY

    command = "source {0}/oozie-env.sh ; oozie admin -oozie {1} -status".format(
        oozie_config_directory, oozie_url)

    return (command, kerberos_env, user)
def execute(configurations={}, parameters={}, host_name=None):
    """
    Checks the Hive LLAP application state via `hive --service llapstatus`
    and maps it to an alert result.

    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    # Passed to --findAppTimeout; 0 means do not wait for the app lookup.
    LLAP_APP_STATUS_CMD_TIMEOUT = 0

    if configurations is None:
        return ('UNKNOWN',
                ['There were no configurations supplied to the script.'])

    result_code = None

    try:
        security_enabled = False
        if SECURITY_ENABLED_KEY in configurations:
            security_enabled = str(
                configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

        check_command_timeout = CHECK_COMMAND_TIMEOUT_DEFAULT
        # BUG FIX: the timeout override is read from `parameters`, so the
        # membership test must also use `parameters` (it previously tested
        # `configurations`, so the override was either ignored or raised
        # KeyError). This matches the sibling metastore alert.
        if CHECK_COMMAND_TIMEOUT_KEY in parameters:
            check_command_timeout = int(parameters[CHECK_COMMAND_TIMEOUT_KEY])

        hive_user = HIVE_USER_DEFAULT
        if HIVE_USER_KEY in configurations:
            hive_user = configurations[HIVE_USER_KEY]

        llap_app_name = LLAP_APP_NAME_DEFAULT
        if LLAP_APP_NAME_KEY in configurations:
            llap_app_name = configurations[LLAP_APP_NAME_KEY]

        if security_enabled:
            if HIVE_PRINCIPAL_KEY in configurations:
                llap_principal = configurations[HIVE_PRINCIPAL_KEY]
            else:
                llap_principal = HIVE_PRINCIPAL_DEFAULT
            llap_principal = llap_principal.replace('_HOST', host_name.lower())

            llap_keytab = HIVE_PRINCIPAL_KEYTAB_DEFAULT
            if HIVE_PRINCIPAL_KEYTAB_KEY in configurations:
                llap_keytab = configurations[HIVE_PRINCIPAL_KEYTAB_KEY]

            # Get the configured Kerberos executable search paths, if any
            if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
                kerberos_executable_search_paths = configurations[
                    KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
            else:
                kerberos_executable_search_paths = None

            kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
            kinitcmd = format(
                "{kinit_path_local} -kt {llap_keytab} {llap_principal}; ")

            # prevent concurrent kinit
            kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
            kinit_lock.acquire()
            try:
                Execute(kinitcmd,
                        user=hive_user,
                        path=[
                            "/bin/", "/usr/bin/", "/usr/lib/hive/bin/",
                            "/usr/sbin/"
                        ],
                        timeout=10)
            finally:
                kinit_lock.release()

        start_time = time.time()

        # Resolve the stack root once; the status command is identical either
        # way (previously the same format() call was duplicated per branch).
        if STACK_NAME in configurations and STACK_ROOT in configurations:
            stack_root = stack_tools.get_stack_root(configurations[STACK_NAME],
                                                    configurations[STACK_ROOT])
        else:
            stack_root = STACK_ROOT_DEFAULT

        llap_status_cmd = stack_root + format(
            "/current/hive-server2-hive2/bin/hive --service llapstatus --name {llap_app_name} --findAppTimeout {LLAP_APP_STATUS_CMD_TIMEOUT}"
        )

        code, output, error = shell.checked_call(llap_status_cmd,
                                                 user=hive_user,
                                                 stderr=subprocess.PIPE,
                                                 timeout=check_command_timeout,
                                                 logoutput=False)

        # Normalize the command output into parseable JSON.
        llap_app_info = make_valid_json(output)

        if llap_app_info is None or 'state' not in llap_app_info:
            alert_label = traceback.format_exc()
            result_code = UKNOWN_STATUS_CODE
            return (result_code, [alert_label])

        retrieved_llap_app_state = llap_app_info['state'].upper()

        if retrieved_llap_app_state in ['RUNNING_ALL']:
            result_code = OK_RESULT_CODE
            total_time = time.time() - start_time
            alert_label = OK_MESSAGE.format(
                llap_app_state_dict.get(retrieved_llap_app_state,
                                        retrieved_llap_app_state), total_time)
        elif retrieved_llap_app_state in ['RUNNING_PARTIAL']:
            live_instances = 0
            desired_instances = 0
            percentInstancesUp = 0
            # Threshold: at least 80% of desired instances must be live.
            percent_desired_instances_to_be_up = 80

            # Get 'live' and 'desired' instances
            if 'liveInstances' not in llap_app_info or 'desiredInstances' not in llap_app_info:
                result_code = CRITICAL_RESULT_CODE
                total_time = time.time() - start_time
                alert_label = CRITICAL_MESSAGE_WITH_STATE.format(
                    llap_app_state_dict.get(retrieved_llap_app_state,
                                            retrieved_llap_app_state),
                    total_time)
                return (result_code, [alert_label])

            live_instances = llap_app_info['liveInstances']
            desired_instances = llap_app_info['desiredInstances']

            if live_instances < 0 or desired_instances <= 0:
                result_code = CRITICAL_RESULT_CODE
                total_time = time.time() - start_time
                alert_label = CRITICAL_MESSAGE_WITH_STATE.format(
                    llap_app_state_dict.get(retrieved_llap_app_state,
                                            retrieved_llap_app_state),
                    total_time)
                return (result_code, [alert_label])

            percentInstancesUp = float(live_instances) / desired_instances * 100
            if percentInstancesUp >= percent_desired_instances_to_be_up:
                result_code = OK_RESULT_CODE
                total_time = time.time() - start_time
                alert_label = MESSAGE_WITH_STATE_AND_INSTANCES.format(
                    llap_app_state_dict.get(retrieved_llap_app_state,
                                            retrieved_llap_app_state),
                    total_time, llap_app_info['liveInstances'],
                    llap_app_info['desiredInstances'])
            else:
                result_code = CRITICAL_RESULT_CODE
                total_time = time.time() - start_time
                alert_label = MESSAGE_WITH_STATE_AND_INSTANCES.format(
                    llap_app_state_dict.get(retrieved_llap_app_state,
                                            retrieved_llap_app_state),
                    total_time, llap_app_info['liveInstances'],
                    llap_app_info['desiredInstances'])
        else:
            result_code = CRITICAL_RESULT_CODE
            total_time = time.time() - start_time
            alert_label = CRITICAL_MESSAGE_WITH_STATE.format(
                llap_app_state_dict.get(retrieved_llap_app_state,
                                        retrieved_llap_app_state), total_time)
    # Was a bare `except:`; Exception still covers every failure above while
    # not swallowing KeyboardInterrupt/SystemExit. Also removed a stray
    # duplicated traceback.format_exc() call whose result was discarded.
    except Exception:
        alert_label = traceback.format_exc()
        result_code = UKNOWN_STATUS_CODE

    return (result_code, [alert_label])
StackFeature.RANGER_AUDIT_DB_SUPPORT, version_for_stack_feature_checks) stack_supports_core_site_for_ranger_plugin = check_stack_feature( StackFeature.CORE_SITE_FOR_RANGER_PLUGINS_SUPPORT, version_for_stack_feature_checks) # This is the version whose state is CURRENT. During an RU, this is the source version. # DO NOT format it since we need the build number too. upgrade_from_version = upgrade_summary.get_source_version() source_stack = default("/commandParams/source_stack", None) if source_stack is None: source_stack = upgrade_summary.get_source_stack("KNOX") source_stack_name = get_stack_name(source_stack) if source_stack_name is not None and source_stack_name != stack_name: source_stack_root = get_stack_root( source_stack_name, default('/configurations/cluster-env/stack_root', None)) else: source_stack_root = stack_root # server configurations # Default value used in HDP 2.3.0.0 and earlier. knox_data_dir = '/var/lib/knox/data' # Important, it has to be strictly greater than 2.3.0.0!!! Logger.info(format("Stack version to use is {version_formatted}")) if version_formatted and check_stack_feature( StackFeature.KNOX_VERSIONED_DATA_DIR, version_formatted): # This is the current version. In the case of a Rolling Upgrade, it will be the newer version. # In the case of a Downgrade, it will be the version downgrading to. # This is always going to be a symlink to /var/lib/knox/data_${version}
def execute(configurations={}, parameters={}, host_name=None):
    """
    Checks the Hive metastore by running `hive -e 'show databases;'` against
    the metastore URI for this host.

    Returns a tuple containing the result code and a pre-formatted result label

    Keyword arguments:
    configurations (dictionary): a mapping of configuration key to value
    parameters (dictionary): a mapping of script parameter key to value
    host_name (string): the name of this host where the alert is running
    """
    if configurations is None:
        return (('UNKNOWN',
                 ['There were no configurations supplied to the script.']))

    if not HIVE_METASTORE_URIS_KEY in configurations:
        return (('UNKNOWN',
                 ['Hive metastore uris were not supplied to the script.']))

    metastore_uris = configurations[HIVE_METASTORE_URIS_KEY].split(',')

    security_enabled = False
    if SECURITY_ENABLED_KEY in configurations:
        security_enabled = str(
            configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'

    check_command_timeout = CHECK_COMMAND_TIMEOUT_DEFAULT
    if CHECK_COMMAND_TIMEOUT_KEY in parameters:
        check_command_timeout = float(parameters[CHECK_COMMAND_TIMEOUT_KEY])

    # defaults
    smokeuser_keytab = SMOKEUSER_KEYTAB_DEFAULT
    smokeuser_principal = SMOKEUSER_PRINCIPAL_DEFAULT
    smokeuser = SMOKEUSER_DEFAULT

    # check script params
    if SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY in parameters:
        smokeuser_principal = parameters[SMOKEUSER_PRINCIPAL_SCRIPT_PARAM_KEY]

    if SMOKEUSER_SCRIPT_PARAM_KEY in parameters:
        smokeuser = parameters[SMOKEUSER_SCRIPT_PARAM_KEY]

    if SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY in parameters:
        smokeuser_keytab = parameters[SMOKEUSER_KEYTAB_SCRIPT_PARAM_KEY]

    # check configurations last as they should always take precedence
    if SMOKEUSER_PRINCIPAL_KEY in configurations:
        smokeuser_principal = configurations[SMOKEUSER_PRINCIPAL_KEY]

    if SMOKEUSER_KEY in configurations:
        smokeuser = configurations[SMOKEUSER_KEY]

    result_code = None

    try:
        if security_enabled:
            if SMOKEUSER_KEYTAB_KEY in configurations:
                smokeuser_keytab = configurations[SMOKEUSER_KEYTAB_KEY]

            # Get the configured Kerberos executable search paths, if any
            if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
                kerberos_executable_search_paths = configurations[
                    KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
            else:
                kerberos_executable_search_paths = None

            kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
            kinitcmd = format(
                "{kinit_path_local} -kt {smokeuser_keytab} {smokeuser_principal}; "
            )

            # prevent concurrent kinit
            kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
            kinit_lock.acquire()
            try:
                Execute(kinitcmd,
                        user=smokeuser,
                        path=[
                            "/bin/", "/usr/bin/", "/usr/lib/hive/bin/",
                            "/usr/sbin/"
                        ],
                        timeout=10)
            finally:
                kinit_lock.release()

        if host_name is None:
            host_name = socket.getfqdn()

        # BUG FIX: metastore_uri was only bound inside the loop below; when no
        # URI mentioned this host, the later format() raised NameError (masked
        # as UNKNOWN by the outer handler). Default to the first URI; the loop
        # still keeps the last host-matching URI, as before.
        metastore_uri = metastore_uris[0]
        for uri in metastore_uris:
            if host_name in uri:
                metastore_uri = uri

        # Prefer the current-stack conf/bin dirs; fall back to legacy paths.
        conf_dir = HIVE_CONF_DIR_LEGACY
        bin_dir = HIVE_BIN_DIR_LEGACY

        if STACK_NAME in configurations and STACK_ROOT in configurations:
            stack_root = stack_tools.get_stack_root(configurations[STACK_NAME],
                                                    configurations[STACK_ROOT])
            hive_conf_dir = stack_root + format("/current/hive-metastore/conf")
            hive_bin_dir = stack_root + format("/current/hive-metastore/bin")

            if os.path.exists(hive_conf_dir):
                conf_dir = hive_conf_dir
                bin_dir = hive_bin_dir

        # Single-shot metastore probe: minimal retries and a short socket
        # timeout so the alert fails fast instead of hanging.
        cmd = format("export HIVE_CONF_DIR='{conf_dir}' ; "
                     "hive --hiveconf hive.metastore.uris={metastore_uri} "
                     "--hiveconf hive.metastore.client.connect.retry.delay=1 "
                     "--hiveconf hive.metastore.failure.retries=1 "
                     "--hiveconf hive.metastore.connect.retries=1 "
                     "--hiveconf hive.metastore.client.socket.timeout=14 "
                     "--hiveconf hive.execution.engine=mr -e 'show databases;'")

        start_time = time.time()

        try:
            Execute(
                cmd,
                user=smokeuser,
                path=["/bin/", "/usr/bin/", "/usr/sbin/", bin_dir],
                timeout=int(check_command_timeout),
                timeout_kill_strategy=TerminateStrategy.KILL_PROCESS_TREE,
            )

            total_time = time.time() - start_time
            result_code = 'OK'
            label = OK_MESSAGE.format(total_time)
        except:
            result_code = 'CRITICAL'
            label = CRITICAL_MESSAGE.format(host_name, traceback.format_exc())
    except:
        label = traceback.format_exc()
        result_code = 'UNKNOWN'

    return ((result_code, [label]))