def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None, env=None):
    """Configure, start, stop, check or decommission the HDFS NameNode.

    :param action: "configure", "start", "stop", "status" or "decommission".
        Any other non-None value falls through every branch and is a no-op.
    :param hdfs_binary: hdfs executable substituted into the safemode / HA
        shell command templates; required for "start" and "stop".
    :param do_format: when true, format_namenode() is called before starting.
    :param upgrade_type: None for a regular start, or "rolling" /
        "nonrolling" while a stack upgrade is in progress.
    :param env: forwarded to safe_zkfc_op() on a rolling upgrade with HA.
    :raises Fail: when action is None, when hdfs_binary is missing for
        start/stop, when the standby bootstrap reports failure, or when
        params.desired_namenode_role is unset during a nonrolling HA upgrade.
    """
    if action is None:
        raise Fail('"action" parameter is required for function namenode().')

    if action in ["start", "stop"] and hdfs_binary is None:
        raise Fail('"hdfs_binary" parameter is required for function namenode().')

    if action == "configure":
        import params
        # This directory must be present before any other action (the HA
        # manual steps for an additional NameNode depend on it).
        create_name_dirs(params.dfs_name_dir)
    elif action == "start":
        Logger.info("Called service {0} with upgrade_type: {1}".format(action, str(upgrade_type)))
        setup_ranger_hdfs(upgrade_type=upgrade_type)
        import params
        if do_format:
            format_namenode()

        File(params.exclude_file_path,
             content=Template("exclude_hosts_list.j2"),
             owner=params.hdfs_user,
             group=params.user_group)

        if (params.dfs_ha_enabled
                and params.dfs_ha_namenode_standby is not None
                and params.hostname == params.dfs_ha_namenode_standby):
            # If the current host is the standby NameNode in an HA deployment,
            # run the bootstrap command to start the NameNode in standby mode.
            # This requires that the active NameNode is already up and running,
            # so this execute should be re-tried upon failure, up to a timeout.
            success = bootstrap_standby_namenode(params)
            if not success:
                raise Fail("Could not bootstrap standby namenode")

        if upgrade_type == "rolling" and params.dfs_ha_enabled:
            # Most likely, ZKFC is up since RU will initiate the failover
            # command. However, if that failed, it would have tried to kill
            # ZKFC manually, so we need to start it if not already running.
            safe_zkfc_op(action, env)

        # NOTE(review): format() receives no arguments yet its templates name
        # locals such as is_previous_image_dir / options / hdfs_binary, so it
        # presumably resolves placeholders from this frame. Do not rename
        # these locals or move the calls into helpers without confirming.
        options = ""
        if upgrade_type == "rolling":
            options = "-rollingUpgrade started"
        elif upgrade_type == "nonrolling":
            is_previous_image_dir = is_previous_fs_image()
            Logger.info(format("Previous file system image dir present is {is_previous_image_dir}"))

            if params.dfs_ha_enabled:
                if params.desired_namenode_role is None:
                    raise Fail("Did not receive parameter \"desired_namenode_role\" to indicate the role that this NameNode should have.")

                if params.desired_namenode_role == "active":
                    # The "-upgrade" command can only be used exactly once.
                    # If used more than once during a retry, it will cause
                    # problems.
                    options = "" if is_previous_image_dir else "-upgrade"
                elif params.desired_namenode_role == "standby":
                    options = "-bootstrapStandby -force"
            else:
                # Both Primary and Secondary NameNode can use the same command.
                options = "" if is_previous_image_dir else "-upgrade"
        Logger.info(format("Option for start command: {options}"))

        service(action="start",
                name="namenode",
                user=params.hdfs_user,
                options=options,
                create_pid_dir=True,
                create_log_dir=True)

        if params.security_enabled:
            Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
                    user=params.hdfs_user)

        is_namenode_safe_mode_off = format("{hdfs_binary} dfsadmin -fs {namenode_address} -safemode get | grep 'Safe mode is OFF'")
        if params.dfs_ha_enabled:
            is_active_namenode_cmd = as_user(
                format("{hdfs_binary} --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"),
                params.hdfs_user,
                env={'PATH': params.hadoop_bin_dir})
        else:
            # NOTE(review): the comment ahead of create_hdfs_directories()
            # says directories are always created on non-HA, yet False is
            # what gets passed as its check here. Confirm how that helper
            # interprets a False guard.
            is_active_namenode_cmd = False

        # During NonRolling Upgrade, both NameNodes are initially down,
        # so no point in checking if this is the active or standby.
        if upgrade_type == "nonrolling":
            is_active_namenode_cmd = False

        # ___Scenario___________|_Expected safemode state__|_Wait for safemode OFF____|
        # no-HA                 | ON -> OFF                | Yes                      |
        # HA and active         | ON -> OFF                | Yes                      |
        # HA and standby        | no change                | no check                 |
        # RU with HA on active  | ON -> OFF                | Yes                      |
        # RU with HA on standby | ON -> OFF                | Yes                      |
        # EU with HA on active  | no change                | no check                 |
        # EU with HA on standby | no change                | no check                 |
        # EU non-HA             | no change                | no check                 |
        check_for_safemode_off = False
        msg = ""
        if params.dfs_ha_enabled:
            if upgrade_type is not None:
                check_for_safemode_off = True
                msg = "Must wait to leave safemode since High Availability is enabled during a Stack Upgrade"
            else:
                # During normal operations, the NameNode is expected to be up.
                # The grep in the command makes the exit code 0 only when the
                # reported service state contains "active".
                code, out = shell.call(is_active_namenode_cmd, logoutput=True)
                if code == 0:  # active
                    check_for_safemode_off = True
                    msg = "Must wait to leave safemode since High Availability is enabled and this is the Active NameNode."
                else:
                    msg = "Will remain in the current safemode state."
        else:
            msg = "Must wait to leave safemode since High Availability is not enabled."
            check_for_safemode_off = True

        Logger.info(msg)

        # During a NonRolling (aka Express Upgrade), stay in safemode since
        # the DataNodes are down.
        stay_in_safe_mode = False
        if upgrade_type == "nonrolling":
            stay_in_safe_mode = True

        if check_for_safemode_off:
            Logger.info("Stay in safe mode: {0}".format(stay_in_safe_mode))
            if not stay_in_safe_mode:
                Logger.info("Wait to leave safemode since must transition from ON to OFF.")
                try:
                    # 180 tries x 10 s sleep: wait up to 30 minutes.
                    Execute(is_namenode_safe_mode_off,
                            tries=180,
                            try_sleep=10,
                            user=params.hdfs_user,
                            logoutput=True)
                except Fail:
                    # Deliberately best-effort: the start is not failed just
                    # because safemode did not clear in time.
                    Logger.error("NameNode is still in safemode, please be careful with commands that need safemode OFF.")

        # Always run this on non-HA, or active NameNode during HA.
        create_hdfs_directories(is_active_namenode_cmd)
    elif action == "stop":
        import params
        service(action="stop", name="namenode", user=params.hdfs_user)
    elif action == "status":
        import status_params
        check_process_status(status_params.namenode_pid_file)
    elif action == "decommission":
        decommission()
def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None,
             upgrade_suspended=False, env=None):
    """Configure, start, stop, check or decommission the HDFS NameNode.

    :param action: "configure", "start", "stop", "status" or "decommission".
        Any other non-None value falls through every branch and is a no-op.
    :param hdfs_binary: hdfs executable handed to wait_for_safemode_off();
        required for "start" and "stop".
    :param do_format: when true (and formatting is not disabled through
        params.hdfs_namenode_format_disabled) format_namenode() is called.
    :param upgrade_type: None for a regular start, otherwise one of the
        constants.UPGRADE_TYPE_* values.
    :param upgrade_suspended: True when an upgrade is currently suspended;
        only consulted when upgrade_type is None.
    :param env: forwarded to safe_zkfc_op() on a rolling upgrade with HA.
    :raises Fail: when action is None, when hdfs_binary is missing for
        start/stop, or when the standby bootstrap reports failure.
    """
    if action is None:
        raise Fail('"action" parameter is required for function namenode().')

    if action in ["start", "stop"] and hdfs_binary is None:
        raise Fail('"hdfs_binary" parameter is required for function namenode().')

    if action == "configure":
        import params
        # This directory must be present before any other action (the HA
        # manual steps for an additional NameNode depend on it).
        create_name_dirs(params.dfs_name_dir)

        # Set up failover / secure ZooKeeper ACLs; this feature is supported
        # from HDP 2.6 onwards.
        set_up_zkfc_security(params)
    elif action == "start":
        Logger.info("Called service {0} with upgrade_type: {1}".format(action, str(upgrade_type)))
        setup_ranger_hdfs(upgrade_type=upgrade_type)
        import params

        File(params.exclude_file_path,
             content=Template("exclude_hosts_list.j2"),
             owner=params.hdfs_user,
             group=params.user_group)

        if do_format and not params.hdfs_namenode_format_disabled:
            format_namenode()

        if (params.dfs_ha_enabled
                and params.dfs_ha_namenode_standby is not None
                and params.hostname == params.dfs_ha_namenode_standby):
            # If the current host is the standby NameNode in an HA deployment,
            # run the bootstrap command to start the NameNode in standby mode.
            # This requires that the active NameNode is already up and running,
            # so this execute should be re-tried upon failure, up to a timeout.
            success = bootstrap_standby_namenode(params)
            if not success:
                raise Fail("Could not bootstrap standby namenode")

        if upgrade_type == constants.UPGRADE_TYPE_ROLLING and params.dfs_ha_enabled:
            # Most likely, ZKFC is up since RU will initiate the failover
            # command. However, if that failed, it would have tried to kill
            # ZKFC manually, so we need to start it if not already running.
            safe_zkfc_op(action, env)

        options = ""
        if upgrade_type == constants.UPGRADE_TYPE_ROLLING:
            if params.upgrade_direction == Direction.UPGRADE:
                options = "-rollingUpgrade started"
            elif params.upgrade_direction == Direction.DOWNGRADE:
                options = "-rollingUpgrade downgrade"
        elif upgrade_type == constants.UPGRADE_TYPE_NON_ROLLING:
            is_previous_image_dir = is_previous_fs_image()
            Logger.info("Previous file system image dir present is {0}".format(str(is_previous_image_dir)))

            if params.upgrade_direction == Direction.UPGRADE:
                options = "-rollingUpgrade started"
            elif params.upgrade_direction == Direction.DOWNGRADE:
                options = "-rollingUpgrade downgrade"
        elif upgrade_type == constants.UPGRADE_TYPE_HOST_ORDERED:
            # Nothing special to do for HOU - should be very close to a
            # normal restart.
            pass
        elif upgrade_type is None and upgrade_suspended is True:
            # The rollingUpgrade flag must be passed in during a suspended
            # upgrade when starting NN.
            if os.path.exists(namenode_upgrade.get_upgrade_in_progress_marker()):
                options = "-rollingUpgrade started"
            else:
                Logger.info("The NameNode upgrade marker file {0} does not exist, yet an upgrade is currently suspended. "
                            "Assuming that the upgrade of NameNode has not occurred yet."
                            .format(namenode_upgrade.get_upgrade_in_progress_marker()))

        Logger.info("Options for start command are: {0}".format(options))

        service(action="start",
                name="namenode",
                user=params.hdfs_user,
                options=options,
                create_pid_dir=True,
                create_log_dir=True)

        if params.security_enabled:
            # NOTE(review): format() is given no arguments, so it presumably
            # resolves the placeholders from params / this frame's locals.
            # Avoid introducing or renaming locals here without confirming.
            Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
                    user=params.hdfs_user)

        # ___Scenario___________|_Expected safemode state__|_Wait for safemode OFF____|
        # no-HA                 | ON -> OFF                | Yes                      |
        # HA and active         | ON -> OFF                | Yes                      |
        # HA and standby        | no change                | No                       |
        # RU with HA on active  | ON -> OFF                | Yes                      |
        # RU with HA on standby | ON -> OFF                | Yes                      |
        # EU with HA on active  | ON -> OFF                | No                       |
        # EU with HA on standby | ON -> OFF                | No                       |
        # EU non-HA             | ON -> OFF                | No                       |

        # Because we do things like create directories after starting NN,
        # the vast majority of the time this should be True - it should only
        # be False if this is HA and we are the Standby NN.
        ensure_safemode_off = True

        # True if this is the only NameNode (non-HA) or if it is the Active
        # one in HA.
        is_active_namenode = True

        if params.dfs_ha_enabled:
            Logger.info("Waiting for the NameNode to broadcast whether it is Active or Standby...")

            # Identity comparison on purpose: only an explicit False marks
            # this NameNode as standby.
            if is_this_namenode_active() is False:
                # We are the STANDBY NN.
                is_active_namenode = False

                # We are the STANDBY NN and this restart is not part of an
                # upgrade.
                if upgrade_type is None:
                    ensure_safemode_off = False

        # During an Express Upgrade, NameNode will not leave SafeMode until
        # the DataNodes are started, so always disable the Safemode check.
        if upgrade_type == constants.UPGRADE_TYPE_NON_ROLLING:
            ensure_safemode_off = False

        # Some informative logging separate from the above logic to keep
        # things a little cleaner.
        if ensure_safemode_off:
            Logger.info("Waiting for this NameNode to leave Safemode due to the following conditions: HA: {0}, isActive: {1}, upgradeType: {2}"
                        .format(params.dfs_ha_enabled, is_active_namenode, upgrade_type))
        else:
            Logger.info("Skipping Safemode check due to the following conditions: HA: {0}, isActive: {1}, upgradeType: {2}"
                        .format(params.dfs_ha_enabled, is_active_namenode, upgrade_type))

        # Wait for Safemode to end.
        if ensure_safemode_off:
            if params.rolling_restart and params.rolling_restart_safemode_exit_timeout:
                # One retry per 30-second sleep. Floor division keeps the
                # retry count an integer on Python 3 as well; on Python 2 it
                # is identical to the previous "/" on ints.
                calculated_retries = int(params.rolling_restart_safemode_exit_timeout) // 30
                wait_for_safemode_off(hdfs_binary,
                                      afterwait_sleep=30,
                                      retries=calculated_retries,
                                      sleep_seconds=30)
            else:
                wait_for_safemode_off(hdfs_binary)

        # Always run this on the "Active" NN unless Safemode has been ignored.
        # In the case where safemode was ignored (like during an express
        # upgrade), NN will be in SafeMode and cannot have directories
        # created.
        if is_active_namenode and ensure_safemode_off:
            create_hdfs_directories()
            create_ranger_audit_hdfs_directories()
        else:
            Logger.info("Skipping creation of HDFS directories since this is either not the Active NameNode or we did not wait for Safemode to finish.")
    elif action == "stop":
        import params
        service(action="stop", name="namenode", user=params.hdfs_user)
    elif action == "status":
        import status_params
        check_process_status(status_params.namenode_pid_file)
    elif action == "decommission":
        decommission()
def namenode(action=None, do_format=True, rolling_restart=False, env=None):
    """Configure, start, stop or decommission the HDFS NameNode.

    :param action: "configure", "start", "stop" or "decommission"; any other
        value (including None) only imports params and does nothing else.
    :param do_format: when true, format_namenode() is called before starting.
    :param rolling_restart: when true the NameNode is started with
        "-rollingUpgrade started" and safe_zkfc_op() is invoked first.
    :param env: forwarded to safe_zkfc_op().
    :raises Fail: when the standby bootstrap reports failure.
    """
    import params

    if action == "configure":
        # This directory must be present before any other action (the HA
        # manual steps for an additional NameNode depend on it).
        create_name_dirs(params.dfs_name_dir)
    elif action == "start":
        if do_format:
            format_namenode()

        File(params.exclude_file_path,
             content=Template("exclude_hosts_list.j2"),
             owner=params.hdfs_user,
             group=params.user_group)

        # 0o755 is the same value as the legacy literal 0755, spelled so that
        # it parses on Python 2.6+ and Python 3 alike.
        Directory(params.hadoop_pid_dir_prefix,
                  mode=0o755,
                  owner=params.hdfs_user,
                  group=params.user_group)

        if (params.dfs_ha_enabled
                and params.dfs_ha_namenode_standby is not None
                and params.hostname == params.dfs_ha_namenode_standby):
            # If the current host is the standby NameNode in an HA deployment,
            # run the bootstrap command to start the NameNode in standby mode.
            # This requires that the active NameNode is already up and running,
            # so this execute should be re-tried upon failure, up to a timeout.
            success = bootstrap_standby_namenode(params)
            if not success:
                raise Fail("Could not bootstrap standby namenode")

        options = "-rollingUpgrade started" if rolling_restart else ""

        if rolling_restart:
            # Must start Zookeeper Failover Controller if it exists on this
            # host because it could have been killed in order to initiate the
            # failover.
            safe_zkfc_op(action, env)

        service(action="start",
                name="namenode",
                user=params.hdfs_user,
                options=options,
                create_pid_dir=True,
                create_log_dir=True)

        # NOTE(review): format() is given no arguments, so it presumably
        # resolves the placeholders from params / this frame's locals. Avoid
        # introducing or renaming locals here without confirming.
        if params.security_enabled:
            Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
                    user=params.hdfs_user)

        if params.dfs_ha_enabled:
            dfs_check_nn_status_cmd = as_user(
                format("hdfs --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"),
                params.hdfs_user,
                env={'PATH': params.hadoop_bin_dir})
        else:
            dfs_check_nn_status_cmd = None

        namenode_safe_mode_off = format("hadoop dfsadmin -fs {namenode_address} -safemode get | grep 'Safe mode is OFF'")

        # If HA is enabled and it is in standby, then stay in safemode,
        # otherwise, leave safemode.
        leave_safe_mode = True
        if dfs_check_nn_status_cmd is not None:
            # The grep makes the exit code 0 only for the active NameNode.
            code, out = shell.call(dfs_check_nn_status_cmd)
            if code != 0:
                leave_safe_mode = False

        if leave_safe_mode:
            # First check if Namenode is not in 'safemode OFF' (equivalent to
            # safemode ON); if so, then leave it.
            code, out = shell.call(namenode_safe_mode_off)
            if code != 0:
                leave_safe_mode_cmd = format("hdfs --config {hadoop_conf_dir} dfsadmin -fs {namenode_address} -safemode leave")
                Execute(leave_safe_mode_cmd,
                        user=params.hdfs_user,
                        path=[params.hadoop_bin_dir])

        # Verify if Namenode should be in safemode OFF.
        Execute(namenode_safe_mode_off,
                tries=40,
                try_sleep=10,
                path=[params.hadoop_bin_dir],
                user=params.hdfs_user,
                only_if=dfs_check_nn_status_cmd)  # skip when HA not active
        create_hdfs_directories(dfs_check_nn_status_cmd)
    elif action == "stop":
        service(action="stop", name="namenode", user=params.hdfs_user)
    elif action == "decommission":
        decommission()
def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None, env=None):
    """Dispatch a lifecycle action for the HDFS NameNode.

    :param action: "configure", "start", "stop", "status" or "decommission".
        Any other non-None value is a no-op.
    :param hdfs_binary: hdfs executable used for the HA state command and the
        safemode wait; required for "start" and "stop".
    :param do_format: when true (and params.hdfs_namenode_format_disabled is
        false) format_namenode() is called before starting.
    :param upgrade_type: None for a regular start, or "rolling" /
        "nonrolling" while a stack upgrade is in progress.
    :param env: forwarded to safe_zkfc_op() on a rolling upgrade with HA.
    :raises Fail: when action is None, when hdfs_binary is missing for
        start/stop, or when the standby bootstrap reports failure.
    """
    if action is None:
        raise Fail('"action" parameter is required for function namenode().')
    if hdfs_binary is None and action in ["start", "stop"]:
        raise Fail('"hdfs_binary" parameter is required for function namenode().')

    # The short actions are handled first and return immediately; everything
    # after these guards is the "start" sequence.
    if action == "configure":
        import params
        # The name directories have to exist ahead of every other action
        # (HA manual steps for an additional namenode).
        create_name_dirs(params.dfs_name_dir)
        return
    if action == "stop":
        import params
        service(action="stop", name="namenode", user=params.hdfs_user)
        return
    if action == "status":
        import status_params
        check_process_status(status_params.namenode_pid_file)
        return
    if action == "decommission":
        decommission()
        return
    if action != "start":
        return

    Logger.info("Called service {0} with upgrade_type: {1}".format(action, str(upgrade_type)))
    setup_ranger_hdfs(upgrade_type=upgrade_type)
    import params

    if do_format and not params.hdfs_namenode_format_disabled:
        format_namenode()

    File(
        params.exclude_file_path,
        content=Template("exclude_hosts_list.j2"),
        owner=params.hdfs_user,
        group=params.user_group,
    )

    if (params.dfs_ha_enabled
            and params.dfs_ha_namenode_standby is not None
            and params.hostname == params.dfs_ha_namenode_standby):
        # This host is the standby NameNode of an HA pair: bootstrap it so it
        # comes up in standby mode. The active NameNode must already be
        # running, so the bootstrap is expected to retry until a timeout.
        success = bootstrap_standby_namenode(params)
        if not success:
            raise Fail("Could not bootstrap standby namenode")

    if upgrade_type == "rolling" and params.dfs_ha_enabled:
        # ZKFC is most likely up because RU initiates the failover command,
        # but if that failed ZKFC would have been killed manually, so make
        # sure it is started.
        safe_zkfc_op(action, env)

    # NOTE(review): format() gets no arguments although its templates name
    # locals (is_previous_image_dir, options, hdfs_binary), so it presumably
    # resolves placeholders from this frame. Local names are kept as-is.
    options = ""
    if upgrade_type in ("rolling", "nonrolling"):
        if upgrade_type == "nonrolling":
            is_previous_image_dir = is_previous_fs_image()
            Logger.info(format("Previous file system image dir present is {is_previous_image_dir}"))

        # Both upgrade flavours map the direction to the same start option.
        if params.upgrade_direction == Direction.UPGRADE:
            options = "-rollingUpgrade started"
        elif params.upgrade_direction == Direction.DOWNGRADE:
            options = "-rollingUpgrade downgrade"
    Logger.info(format("Option for start command: {options}"))

    service(
        action="start",
        name="namenode",
        user=params.hdfs_user,
        options=options,
        create_pid_dir=True,
        create_log_dir=True,
    )

    if params.security_enabled:
        kinit_cmd = format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}")
        Execute(kinit_cmd, user=params.hdfs_user)

    if params.dfs_ha_enabled:
        is_active_namenode_cmd = as_user(
            format("{hdfs_binary} --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"),
            params.hdfs_user,
            env={'PATH': params.hadoop_bin_dir},
        )
    else:
        is_active_namenode_cmd = True

    # In a NonRolling upgrade both NameNodes start out down, so asking which
    # one is active is pointless.
    if upgrade_type == "nonrolling":
        is_active_namenode_cmd = False

    # ___Scenario___________|_Expected safemode state__|_Wait for safemode OFF____|
    # no-HA                 | ON -> OFF                | Yes                      |
    # HA and active         | ON -> OFF                | Yes                      |
    # HA and standby        | no change                | no check                 |
    # RU with HA on active  | ON -> OFF                | Yes                      |
    # RU with HA on standby | ON -> OFF                | Yes                      |
    # EU with HA on active  | no change                | no check                 |
    # EU with HA on standby | no change                | no check                 |
    # EU non-HA             | no change                | no check                 |
    if not params.dfs_ha_enabled:
        msg = "Must wait to leave safemode since High Availability is not enabled."
        check_for_safemode_off = True
    elif upgrade_type is not None:
        check_for_safemode_off = True
        msg = "Must wait to leave safemode since High Availability is enabled during a Stack Upgrade"
    else:
        Logger.info("Wait for NameNode to become active.")
        if is_active_namenode(hdfs_binary):
            check_for_safemode_off = True
            msg = "Must wait to leave safemode since High Availability is enabled and this is the Active NameNode."
        else:
            check_for_safemode_off = False
            msg = "Will remain in the current safemode state."
    Logger.info(msg)

    # A NonRolling (Express) upgrade keeps the NameNode in safemode because
    # the DataNodes are down.
    stay_in_safe_mode = upgrade_type == "nonrolling"

    if check_for_safemode_off:
        Logger.info("Stay in safe mode: {0}".format(stay_in_safe_mode))
        if not stay_in_safe_mode:
            wait_for_safemode_off(hdfs_binary)

    # Always run this on non-HA, or active NameNode during HA.
    create_hdfs_directories(is_active_namenode_cmd)
    create_ranger_audit_hdfs_directories(is_active_namenode_cmd)
def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None, env=None):
    """Run one lifecycle action against the HDFS NameNode.

    :param action: "configure", "start", "stop", "status" or "decommission".
        Any other non-None value is a no-op.
    :param hdfs_binary: hdfs executable used for the HA state command and the
        safemode wait; required for "start" and "stop".
    :param do_format: when true (and params.hdfs_namenode_format_disabled is
        false) format_namenode() is called before starting.
    :param upgrade_type: None for a regular start, or "rolling" /
        "nonrolling" while a stack upgrade is in progress.
    :param env: forwarded to safe_zkfc_op() on a rolling upgrade with HA.
    :raises Fail: when action is None, when hdfs_binary is missing for
        start/stop, or when the standby bootstrap reports failure.
    """
    if action is None:
        raise Fail('"action" parameter is required for function namenode().')
    elif action in ["start", "stop"] and hdfs_binary is None:
        raise Fail('"hdfs_binary" parameter is required for function namenode().')

    if action == "decommission":
        decommission()
    elif action == "status":
        import status_params
        check_process_status(status_params.namenode_pid_file)
    elif action == "stop":
        import params
        service(action="stop", name="namenode", user=params.hdfs_user)
    elif action == "configure":
        import params
        # Needed up front: the HA manual steps for an additional namenode
        # expect this directory to exist before any other action.
        create_name_dirs(params.dfs_name_dir)
    elif action == "start":
        Logger.info("Called service {0} with upgrade_type: {1}".format(action, str(upgrade_type)))
        setup_ranger_hdfs(upgrade_type=upgrade_type)
        import params

        if do_format:
            if not params.hdfs_namenode_format_disabled:
                format_namenode()

        File(params.exclude_file_path,
             content=Template("exclude_hosts_list.j2"),
             owner=params.hdfs_user,
             group=params.user_group)

        if params.dfs_ha_enabled and params.dfs_ha_namenode_standby is not None:
            if params.hostname == params.dfs_ha_namenode_standby:
                # Standby NameNode of an HA deployment: bootstrap it so that
                # it starts in standby mode. The active NameNode has to be up
                # already, hence the bootstrap retries up to a timeout.
                success = bootstrap_standby_namenode(params)
                if not success:
                    raise Fail("Could not bootstrap standby namenode")

        if upgrade_type == "rolling" and params.dfs_ha_enabled:
            # RU normally leaves ZKFC running after initiating the failover.
            # If that failed, ZKFC was killed manually and must be started
            # again here.
            safe_zkfc_op(action, env)

        # NOTE(review): format() gets no arguments although its templates
        # name locals (is_previous_image_dir, options, hdfs_binary), so it
        # presumably resolves placeholders from this frame. Local names are
        # kept as-is.
        options = ""
        if upgrade_type == "nonrolling":
            is_previous_image_dir = is_previous_fs_image()
            Logger.info(format("Previous file system image dir present is {is_previous_image_dir}"))
        if upgrade_type == "rolling" or upgrade_type == "nonrolling":
            # Rolling and nonrolling upgrades share the direction mapping.
            if params.upgrade_direction == Direction.UPGRADE:
                options = "-rollingUpgrade started"
            elif params.upgrade_direction == Direction.DOWNGRADE:
                options = "-rollingUpgrade downgrade"
        Logger.info(format("Option for start command: {options}"))

        service(action="start", name="namenode", user=params.hdfs_user,
                options=options, create_pid_dir=True, create_log_dir=True)

        if params.security_enabled:
            Execute(
                format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
                user=params.hdfs_user,
            )

        if not params.dfs_ha_enabled:
            is_active_namenode_cmd = True
        else:
            is_active_namenode_cmd = as_user(
                format("{hdfs_binary} --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"),
                params.hdfs_user,
                env={'PATH': params.hadoop_bin_dir})

        # Both NameNodes are down at the start of a NonRolling upgrade, so
        # the active/standby question is moot.
        if upgrade_type == "nonrolling":
            is_active_namenode_cmd = False

        # ___Scenario___________|_Expected safemode state__|_Wait for safemode OFF____|
        # no-HA                 | ON -> OFF                | Yes                      |
        # HA and active         | ON -> OFF                | Yes                      |
        # HA and standby        | no change                | no check                 |
        # RU with HA on active  | ON -> OFF                | Yes                      |
        # RU with HA on standby | ON -> OFF                | Yes                      |
        # EU with HA on active  | no change                | no check                 |
        # EU with HA on standby | no change                | no check                 |
        # EU non-HA             | no change                | no check                 |
        check_for_safemode_off = True
        if not params.dfs_ha_enabled:
            msg = "Must wait to leave safemode since High Availability is not enabled."
        elif upgrade_type is not None:
            msg = "Must wait to leave safemode since High Availability is enabled during a Stack Upgrade"
        else:
            Logger.info("Wait for NameNode to become active.")
            if is_active_namenode(hdfs_binary):
                msg = "Must wait to leave safemode since High Availability is enabled and this is the Active NameNode."
            else:
                # Standby during normal operation: nothing to wait for.
                check_for_safemode_off = False
                msg = "Will remain in the current safemode state."
        Logger.info(msg)

        # Express (NonRolling) upgrade: DataNodes are down, so safemode stays.
        if upgrade_type == "nonrolling":
            stay_in_safe_mode = True
        else:
            stay_in_safe_mode = False

        if check_for_safemode_off:
            Logger.info("Stay in safe mode: {0}".format(stay_in_safe_mode))
            if stay_in_safe_mode is False:
                wait_for_safemode_off(hdfs_binary)

        # Always run this on non-HA, or active NameNode during HA.
        create_hdfs_directories(is_active_namenode_cmd)
        create_ranger_audit_hdfs_directories(is_active_namenode_cmd)
def namenode(action=None, do_format=True, rolling_restart=False, env=None):
    """Run one lifecycle action against the HDFS NameNode.

    :param action: "configure", "start", "stop", "status" or "decommission";
        any other value (including None) is a no-op.
    :param do_format: when true, format_namenode() is called before starting.
    :param rolling_restart: when true the NameNode is started with
        "-rollingUpgrade started", safe_zkfc_op() is invoked first, and a
        standby NameNode also waits for safemode to turn off.
    :param env: forwarded to safe_zkfc_op().
    :raises Fail: when the standby bootstrap reports failure.
    """
    # The short actions are handled first and return immediately; everything
    # after these guards is the "start" sequence.
    if action == "configure":
        import params
        # The name directories have to exist ahead of every other action
        # (HA manual steps for an additional namenode).
        create_name_dirs(params.dfs_name_dir)
        return
    if action == "stop":
        import params
        service(action="stop", name="namenode", user=params.hdfs_user)
        return
    if action == "status":
        import status_params
        check_process_status(status_params.namenode_pid_file)
        return
    if action == "decommission":
        decommission()
        return
    if action != "start":
        return

    setup_ranger_hdfs(rolling_upgrade=rolling_restart)
    import params

    if do_format:
        format_namenode()

    File(
        params.exclude_file_path,
        content=Template("exclude_hosts_list.j2"),
        owner=params.hdfs_user,
        group=params.user_group,
    )

    if (params.dfs_ha_enabled
            and params.dfs_ha_namenode_standby is not None
            and params.hostname == params.dfs_ha_namenode_standby):
        # This host is the standby NameNode of an HA pair: bootstrap it so it
        # comes up in standby mode. The active NameNode must already be
        # running, so the bootstrap is expected to retry until a timeout.
        success = bootstrap_standby_namenode(params)
        if not success:
            raise Fail("Could not bootstrap standby namenode")

    options = ""
    if rolling_restart:
        options = "-rollingUpgrade started"
        # The Zookeeper Failover Controller may have been killed to trigger
        # the failover, so start it if it lives on this host.
        safe_zkfc_op(action, env)

    service(
        action="start",
        name="namenode",
        user=params.hdfs_user,
        options=options,
        create_pid_dir=True,
        create_log_dir=True,
    )

    # NOTE(review): format() gets no arguments, so it presumably resolves
    # placeholders from params / this frame. Local names are kept as-is.
    if params.security_enabled:
        Execute(
            format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
            user=params.hdfs_user,
        )

    is_namenode_safe_mode_off = format("hdfs dfsadmin -fs {namenode_address} -safemode get | grep 'Safe mode is OFF'")

    if params.dfs_ha_enabled:
        is_active_namenode_cmd = as_user(
            format("hdfs --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"),
            params.hdfs_user,
            env={'PATH': params.hadoop_bin_dir},
        )
    else:
        is_active_namenode_cmd = None

    # In normal operation a standby NameNode under HA needs no safemode
    # check. During a Rolling Upgrade both NameNodes must eventually leave
    # safemode, and that can be waited for here.
    # ___Scenario_________|_Expected safemode state___|_Wait for safemode OFF____|
    # 1 (HA and active)   | ON -> OFF                 | Yes                      |
    # 2 (HA and standby)  | no change (yes during RU) | no check (yes during RU) |
    # 3 (no-HA)           | ON -> OFF                 | Yes                      |
    if not params.dfs_ha_enabled:
        msg = "Must wait to leave safemode since High Availability is not enabled."
        check_for_safemode_off = True
    else:
        # The grep makes the exit code 0 only for the active NameNode.
        code, out = shell.call(is_active_namenode_cmd, logoutput=True)
        if code == 0:
            check_for_safemode_off = True
            msg = "Must wait to leave safemode since High Availability is enabled and this is the Active NameNode."
        elif rolling_restart:
            check_for_safemode_off = True
            msg = "Must wait to leave safemode since High Availability is enabled during a Rolling Upgrade"
        else:
            check_for_safemode_off = False
            msg = "Will remain in the current safemode state."
    Logger.info(msg)

    if check_for_safemode_off:
        # Poll until the NameNode reports 'Safe mode is OFF'. If it already
        # is, the first attempt succeeds and nothing changes. A timeout is
        # only logged; it does not fail the start.
        Logger.info("Checking the NameNode safemode status since may need to transition from ON to OFF.")
        try:
            # 180 tries x 10 s sleep: wait up to 30 minutes.
            Execute(
                is_namenode_safe_mode_off,
                tries=180,
                try_sleep=10,
                user=params.hdfs_user,
                logoutput=True,
            )
        except Fail:
            Logger.error("NameNode is still in safemode, please be careful with commands that need safemode OFF.")

    # Always run this on non-HA, or active NameNode during HA.
    create_hdfs_directories(is_active_namenode_cmd)
def namenode(action=None, do_format=True, upgrade_type=None, env=None):
    """Configure, start, stop or decommission the HDFS NameNode.

    :param action: "configure", "start", "stop" or "decommission"; any other
        value (including None) only imports params and does nothing else.
    :param do_format: when true, format_namenode() is called before starting.
    :param upgrade_type: None for a regular start, or "rolling" /
        "nonrolling" while a stack upgrade is in progress.
    :param env: forwarded to safe_zkfc_op() on a rolling upgrade.
    :raises Fail: when the standby bootstrap reports failure.
    """
    import params

    if action == "configure":
        # This directory must be present before any other action (the HA
        # manual steps for an additional NameNode depend on it).
        create_name_dirs(params.dfs_name_dir)
    elif action == "start":
        if do_format:
            format_namenode()

        File(params.exclude_file_path,
             content=Template("exclude_hosts_list.j2"),
             owner=params.hdfs_user,
             group=params.user_group)

        # 0o755 is the same value as the legacy literal 0755, spelled so that
        # it parses on Python 2.6+ and Python 3 alike.
        Directory(params.hadoop_pid_dir_prefix,
                  mode=0o755,
                  owner=params.hdfs_user,
                  group=params.user_group)

        if (params.dfs_ha_enabled
                and params.dfs_ha_namenode_standby is not None
                and params.hostname == params.dfs_ha_namenode_standby):
            # If the current host is the standby NameNode in an HA deployment,
            # run the bootstrap command to start the NameNode in standby mode.
            # This requires that the active NameNode is already up and running,
            # so this execute should be re-tried upon failure, up to a timeout.
            success = bootstrap_standby_namenode(params)
            if not success:
                raise Fail("Could not bootstrap standby namenode")

        if upgrade_type == "rolling":
            # Must start Zookeeper Failover Controller if it exists on this
            # host because it could have been killed in order to initiate the
            # failover.
            safe_zkfc_op(action, env)

        # NOTE(review): format() is given no arguments although its templates
        # name locals (is_previous_image_dir, options), so it presumably
        # resolves placeholders from this frame. Do not rename these locals
        # or move the calls into helpers without confirming.
        options = ""
        if upgrade_type == "rolling":
            if params.upgrade_direction == Direction.UPGRADE:
                options = "-rollingUpgrade started"
            elif params.upgrade_direction == Direction.DOWNGRADE:
                options = "-rollingUpgrade downgrade"
        elif upgrade_type == "nonrolling":
            is_previous_image_dir = is_previous_fs_image()
            Logger.info(format("Previous file system image dir present is {is_previous_image_dir}"))

            if params.upgrade_direction == Direction.UPGRADE:
                options = "-rollingUpgrade started"
            elif params.upgrade_direction == Direction.DOWNGRADE:
                options = "-rollingUpgrade downgrade"
        Logger.info(format("Option for start command: {options}"))

        service(action="start",
                name="namenode",
                user=params.hdfs_user,
                options=options,
                create_pid_dir=True,
                create_log_dir=True)

        if params.security_enabled:
            Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
                    user=params.hdfs_user)

        if params.dfs_ha_enabled:
            is_active_namenode_cmd = as_user(
                format("hdfs --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"),
                params.hdfs_user,
                env={'PATH': params.hadoop_bin_dir})
        else:
            is_active_namenode_cmd = True

        # During NonRolling Upgrade, both NameNodes are initially down,
        # so no point in checking if this is the active or standby.
        if upgrade_type == "nonrolling":
            is_active_namenode_cmd = False

        # ___Scenario___________|_Expected safemode state__|_Wait for safemode OFF____|
        # no-HA                 | ON -> OFF                | Yes                      |
        # HA and active         | ON -> OFF                | Yes                      |
        # HA and standby        | no change                | no check                 |
        # RU with HA on active  | ON -> OFF                | Yes                      |
        # RU with HA on standby | ON -> OFF                | Yes                      |
        # EU with HA on active  | no change                | no check                 |
        # EU with HA on standby | no change                | no check                 |
        # EU non-HA             | no change                | no check                 |
        check_for_safemode_off = False
        msg = ""
        if params.dfs_ha_enabled:
            if upgrade_type is not None:
                check_for_safemode_off = True
                msg = "Must wait to leave safemode since High Availability is enabled during a Stack Upgrade"
            else:
                Logger.info("Wait for NameNode to become active.")
                if is_active_namenode():  # active
                    check_for_safemode_off = True
                    msg = "Must wait to leave safemode since High Availability is enabled and this is the Active NameNode."
                else:
                    msg = "Will remain in the current safemode state."
        else:
            msg = "Must wait to leave safemode since High Availability is not enabled."
            check_for_safemode_off = True

        Logger.info(msg)

        # During a NonRolling (aka Express Upgrade), stay in safemode since
        # the DataNodes are down.
        stay_in_safe_mode = False
        if upgrade_type == "nonrolling":
            stay_in_safe_mode = True

        if check_for_safemode_off:
            Logger.info("Stay in safe mode: {0}".format(stay_in_safe_mode))
            if not stay_in_safe_mode:
                wait_for_safemode_off()

        # Always run this on non-HA, or active NameNode during HA.
        create_hdfs_directories(is_active_namenode_cmd)
    elif action == "stop":
        service(action="stop", name="namenode", user=params.hdfs_user)
    elif action == "decommission":
        decommission()
def namenode(action=None, do_format=True, rolling_restart=False, env=None):
    """Configure, start, stop, or decommission the NameNode on this host.

    NOTE(review): an earlier ``namenode`` definition with a different
    signature appears in this file; because this one is defined later, it
    rebinds the name. Confirm which definition is meant to be live.

    :param action: one of "configure", "start", "stop" or "decommission".
      Any other value (including None) is silently ignored.
    :param do_format: when true, ``format_namenode()`` is called before the
      NameNode is started. Only consulted for the "start" action.
    :param rolling_restart: when true, the NameNode is started with the
      "-rollingUpgrade started" option and ``safe_zkfc_op`` is invoked first.
      Only consulted for the "start" action.
    :param env: passed through to ``safe_zkfc_op``; not otherwise used here.
    :raises Fail: if this host is the configured standby NameNode in an HA
      deployment and ``bootstrap_standby_namenode`` reports failure.
    """
    import params

    if action == "configure":
        # This directory must be present before any action (HA manual steps
        # for an additional NameNode).
        create_name_dirs(params.dfs_name_dir)

    elif action == "start":
        if do_format:
            format_namenode()

        File(params.exclude_file_path,
             content=Template("exclude_hosts_list.j2"),
             owner=params.hdfs_user,
             group=params.user_group
        )

        # 0o755 is the portable spelling of the octal literal: the legacy
        # form 0755 is a SyntaxError on Python 3.
        Directory(params.hadoop_pid_dir_prefix,
                  mode=0o755,
                  owner=params.hdfs_user,
                  group=params.user_group
        )

        if params.dfs_ha_enabled and \
           params.dfs_ha_namenode_standby is not None and \
           params.hostname == params.dfs_ha_namenode_standby:
            # If the current host is the standby NameNode in an HA deployment,
            # run the bootstrap command to start the NameNode in standby mode.
            # This requires that the active NameNode is already up and
            # running, so this execute should be re-tried upon failure, up to
            # a timeout.
            success = bootstrap_standby_namenode(params)
            if not success:
                raise Fail("Could not bootstrap standby namenode")

        options = "-rollingUpgrade started" if rolling_restart else ""

        if rolling_restart:
            # Must start Zookeeper Failover Controller if it exists on this
            # host because it could have been killed in order to initiate the
            # failover.
            safe_zkfc_op(action, env)

        service(
            action="start",
            name="namenode",
            user=params.hdfs_user,
            options=options,
            create_pid_dir=True,
            create_log_dir=True
        )

        if params.security_enabled:
            Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
                    user=params.hdfs_user)

        if params.dfs_ha_enabled:
            dfs_check_nn_status_cmd = as_user(
                format("hdfs --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"),
                params.hdfs_user,
                env={'PATH': params.hadoop_bin_dir})
        else:
            dfs_check_nn_status_cmd = None

        namenode_safe_mode_off = format("hadoop dfsadmin -safemode get | grep 'Safe mode is OFF'")

        # If HA is enabled and it is in standby, then stay in safemode,
        # otherwise, leave safemode.
        leave_safe_mode = True
        if dfs_check_nn_status_cmd is not None:
            code, out = shell.call(dfs_check_nn_status_cmd)  # If active NN, code will be 0
            if code != 0:
                leave_safe_mode = False

        if leave_safe_mode:
            # First check if Namenode is not in 'safemode OFF' (equivalent to
            # safemode ON); if so, then leave it.
            code, out = shell.call(namenode_safe_mode_off)
            if code != 0:
                leave_safe_mode_cmd = format("hdfs --config {hadoop_conf_dir} dfsadmin -safemode leave")
                try:
                    Execute(leave_safe_mode_cmd,
                            user=params.hdfs_user,
                            path=[params.hadoop_bin_dir],
                    )
                except Exception:
                    # Catch Exception rather than using a bare except so that
                    # KeyboardInterrupt/SystemExit still abort immediately
                    # instead of triggering a 60 second sleep and a retry.
                    Logger.info("Leaving safemode failed; sleeping 60 seconds before retrying once.")
                    time.sleep(60)
                    Execute(leave_safe_mode_cmd,
                            user=params.hdfs_user,
                            path=[params.hadoop_bin_dir],
                    )

        # Verify if Namenode should be in safemode OFF
        Execute(namenode_safe_mode_off,
                tries=40,
                try_sleep=10,
                path=[params.hadoop_bin_dir],
                user=params.hdfs_user,
                only_if=dfs_check_nn_status_cmd  # skip when HA not active
        )
        create_hdfs_directories(dfs_check_nn_status_cmd)

    elif action == "stop":
        service(
            action="stop",
            name="namenode",
            user=params.hdfs_user
        )

    elif action == "decommission":
        decommission()