def is_this_namenode_active(name_service):
    """
    Report whether the NameNode identified by ``params.namenode_id`` is Active.

    A verdict is only returned once this NameNode is listed as either Active
    or Standby. The point is to be sure this NameNode has fully loaded and
    registered before the other NameNode may be restarted: if the other
    (Active) NameNode were restarted while this one has not yet entered
    Standby, there could be a loss of service.

    :param name_service: forwarded as ``name_service`` to
      ``namenode_ha_utils.get_namenode_states``
    :return: True if this NameNode is listed as Active, False if it is listed
      as Standby
    :raises Fail: if this NameNode is listed as neither Active nor Standby
    """
    import params

    # The result is indexed as [0] = active entries, [1] = standby entries.
    # Per the examples kept from the original comments, each entry looks like
    # ('nn1', 'c6401.ambari.apache.org:50070'), e.g.
    #   ([('nn1', 'c6401.ambari.apache.org:50070')],
    #    [('nn2', 'c6402.ambari.apache.org:50070')],
    #    [])
    namenode_states = namenode_ha_utils.get_namenode_states(
        params.hdfs_site, params.security_enabled, params.hdfs_user,
        times=5, sleep_time=5, backoff_factor=2, name_service=name_service)

    active_namenodes = namenode_states[0] if len(namenode_states[0]) >= 1 else []
    standby_namenodes = namenode_states[1] if len(namenode_states[1]) >= 1 else []

    # Active is checked first: being listed there settles the answer.
    if any(params.namenode_id in entry for entry in active_namenodes):
        return True

    # Not active; being listed as standby is the other acceptable final state.
    if any(params.namenode_id in entry for entry in standby_namenodes):
        return False

    # Neither active nor standby yet. Raising Fail signals "not ready"; the
    # original comment says a retry annotation (not visible in this block)
    # catches it and calls the function again.
    # NOTE(review): format() is called without arguments, so it presumably
    # resolves {namenode_id} from params / the calling scope -- confirm.
    raise Fail(
        format("The NameNode {namenode_id} is not listed as Active or Standby, waiting..."))
def is_there_any_active_nn(name_service):
    """
    Tell whether at least one NameNode is currently reported as Active.

    This function used to be defined twice in a row with identical code; the
    second definition simply re-bound the name. A single definition is kept.

    :param name_service: forwarded as ``name_service`` to
      ``namenode_ha_utils.get_namenode_states``
    :return: True if the list of active NameNodes is non-empty, else False
    """
    import params

    # Index 0 holds the active entries, e.g.
    # [('nn1', 'c6401.ambari.apache.org:50070')]; index 1 holds the standby
    # NameNodes and is not needed here.
    namenode_states = namenode_ha_utils.get_namenode_states(
        params.hdfs_site, params.security_enabled, params.hdfs_user,
        times=3, sleep_time=3, backoff_factor=2, name_service=name_service)

    # The previous "[] if len(x) < 1 else x" followed by "len(...) > 0" reduces
    # to this single length check.
    return len(namenode_states[0]) > 0
def initiate_safe_zkfc_failover():
    """
    Fail over away from this NameNode when it is the active one.

    If this NameNode is active and the other NameNode is in standby, run
    ``hdfs haadmin -failover`` and then wait for this NameNode to report
    standby. If the failover command fails, ZKFC on this host is killed to
    force a failover; in that case ZKFC also has to be started manually during
    the Restart. In any other HA state the failover is skipped.

    NOTE(review): this file contains a second, later definition of
    initiate_safe_zkfc_failover which re-binds the name. That one omits
    ``-ns {dfs_ha_nameservices}`` from the haadmin commands -- confirm which
    variant is intended and remove the other.

    NOTE(review): format() is called without arguments, so it presumably
    resolves placeholders from params and this function's local variables.
    Keep the local names used in format strings (active_namenode_id,
    standby_namenode_id, code) unchanged.
    """
    import params

    # A Kerberos ticket is required before any HDFS command can run.
    if params.security_enabled:
        Execute(
            format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
            user=params.hdfs_user)

    active_namenodes, standby_namenodes, unknown_namenodes = get_namenode_states(
        params.hdfs_site, params.security_enabled, params.hdfs_user)

    # Only the id (first field) of the first entry in each list is used.
    active_namenode_id = active_namenodes[0][0] if active_namenodes else None
    standby_namenode_id = standby_namenodes[0][0] if standby_namenodes else None

    if active_namenode_id:
        Logger.info(format("Active NameNode id: {active_namenode_id}"))
    if standby_namenode_id:
        Logger.info(format("Standby NameNode id: {standby_namenode_id}"))
    for unknown_namenode in unknown_namenodes or ():
        Logger.info("NameNode HA state for {0} is unknown".format(
            unknown_namenode[0]))

    # Fail over only when this NameNode is active and the other one is up and
    # in standby (i.e. ready to become active); otherwise there is nothing to do.
    if not (params.namenode_id == active_namenode_id
            and params.other_namenode_id == standby_namenode_id):
        Logger.info(
            "Rolling Upgrade - Skipping ZKFC failover on NameNode host {0}.".format(
                params.hostname))
        return

    Logger.info(
        format("NameNode {namenode_id} is active and NameNode {other_namenode_id} is in standby"))

    failover_command = format(
        "hdfs haadmin -ns {dfs_ha_nameservices} -failover {namenode_id} {other_namenode_id}")
    check_standby_cmd = format(
        "hdfs haadmin -ns {dfs_ha_nameservices} -getServiceState {namenode_id} | grep standby")

    msg = "Rolling Upgrade - Initiating a ZKFC failover on active NameNode host {0}.".format(
        params.hostname)
    Logger.info(msg)
    code, out = shell.call(failover_command, user=params.hdfs_user, logoutput=True)
    Logger.info(format("Rolling Upgrade - failover command returned {code}"))

    # A clean failover means we just wait for this NameNode to report standby.
    wait_for_standby = code == 0
    if not wait_for_standby:
        # The failover command failed: try to kill ZKFC manually, then probe
        # the current service state.
        was_zkfc_killed = kill_zkfc(params.hdfs_user)
        code, out = shell.call(check_standby_cmd, user=params.hdfs_user, logoutput=True)
        Logger.info(format("Rolling Upgrade - check for standby returned {code}"))
        if code == 255 and out:
            # Exit code 255 together with output is treated as "already down".
            Logger.info("Rolling Upgrade - NameNode is already down.")
        elif was_zkfc_killed:
            # Only mandate that this be the standby NameNode if ZKFC was
            # indeed killed to initiate a failover.
            wait_for_standby = True

    if wait_for_standby:
        Logger.info("Waiting for this NameNode to become the standby one.")
        Execute(check_standby_cmd, user=params.hdfs_user, tries=50, try_sleep=6,
                logoutput=True)
def initiate_safe_zkfc_failover():
    """
    Perform a safe ZKFC failover if this host runs the active NameNode.

    When this NameNode is active and the other NameNode is in standby, issue
    ``hdfs haadmin -failover`` and wait until this NameNode reports standby.
    Should the failover command fail, ZKFC on this host is killed to force the
    failover; ZKFC then has to be started manually during the Restart. For any
    other combination of HA states the failover is skipped.

    NOTE(review): an earlier definition of initiate_safe_zkfc_failover exists
    in this file; this later one is the one that takes effect. The earlier one
    additionally passes ``-ns {dfs_ha_nameservices}`` to haadmin -- confirm
    which variant is intended and remove the other.

    NOTE(review): format() is called without arguments, so it presumably
    resolves placeholders from params and this function's local variables.
    Keep the local names used in format strings (active_namenode_id,
    standby_namenode_id, code) unchanged.
    """
    import params

    # kinit must happen before running the HDFS command on a secured cluster.
    if params.security_enabled:
        Execute(
            format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
            user=params.hdfs_user)

    active_namenodes, standby_namenodes, unknown_namenodes = get_namenode_states(
        params.hdfs_site, params.security_enabled, params.hdfs_user)

    # Take the id (first field) of the first entry of each list, if any.
    active_namenode_id = active_namenodes[0][0] if active_namenodes else None
    standby_namenode_id = standby_namenodes[0][0] if standby_namenodes else None

    if active_namenode_id:
        Logger.info(format("Active NameNode id: {active_namenode_id}"))
    if standby_namenode_id:
        Logger.info(format("Standby NameNode id: {standby_namenode_id}"))
    for unknown_namenode in unknown_namenodes or ():
        Logger.info("NameNode HA state for {0} is unknown".format(unknown_namenode[0]))

    # Anything other than "this one active, other one standby" means the other
    # NameNode is not ready to take over, so the failover is skipped.
    if not (params.namenode_id == active_namenode_id
            and params.other_namenode_id == standby_namenode_id):
        Logger.info(
            "Rolling Upgrade - Skipping ZKFC failover on NameNode host {0}.".format(
                params.hostname))
        return

    Logger.info(
        format("NameNode {namenode_id} is active and NameNode {other_namenode_id} is in standby"))

    failover_command = format("hdfs haadmin -failover {namenode_id} {other_namenode_id}")
    check_standby_cmd = format("hdfs haadmin -getServiceState {namenode_id} | grep standby")

    msg = "Rolling Upgrade - Initiating a ZKFC failover on active NameNode host {0}.".format(
        params.hostname)
    Logger.info(msg)
    code, out = shell.call(failover_command, user=params.hdfs_user, logoutput=True)
    Logger.info(format("Rolling Upgrade - failover command returned {code}"))

    # Exit code 0 from the failover command: just wait for standby.
    wait_for_standby = code == 0
    if not wait_for_standby:
        # Fallback path: try to kill ZKFC manually and check the service state.
        was_zkfc_killed = kill_zkfc(params.hdfs_user)
        code, out = shell.call(check_standby_cmd, user=params.hdfs_user, logoutput=True)
        Logger.info(format("Rolling Upgrade - check for standby returned {code}"))
        if code == 255 and out:
            # Exit code 255 together with output is treated as "already down".
            Logger.info("Rolling Upgrade - NameNode is already down.")
        elif was_zkfc_killed:
            # Only mandate that this be the standby NameNode if ZKFC was
            # indeed killed to initiate a failover.
            wait_for_standby = True

    if wait_for_standby:
        Logger.info("Waiting for this NameNode to become the standby one.")
        Execute(check_standby_cmd, user=params.hdfs_user, tries=50, try_sleep=6,
                logoutput=True)