def is_active_namenode(hdfs_binary):
    """
    Report whether the current NameNode is the active one.

    When NameNode HA is disabled the current NameNode is treated as active.
    With HA enabled, this and the other NameNode are polled in turn with
    "haadmin -getServiceState", up to 5 rounds with a 6 second pause between
    rounds (no pause after the last one).

    :param hdfs_binary: name/path of the HDFS binary to use
    :return: True if current NameNode is active; False if the other NameNode
             is active or if no active NameNode was found
    """
    import params

    if not params.dfs_ha_enabled:
        return True

    # NOTE(review): format() appears to resolve {placeholders} from local
    # variables and params, so hdfs_binary must stay a local of this function.
    this_nn_check = as_user(
        format("{hdfs_binary} --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"),
        params.hdfs_user,
        env={'PATH': params.hadoop_bin_dir})
    other_nn_check = as_user(
        format("{hdfs_binary} --config {hadoop_conf_dir} haadmin -getServiceState {other_namenode_id} | grep active"),
        params.hdfs_user,
        env={'PATH': params.hadoop_bin_dir})

    max_rounds = 5
    pause_seconds = 6
    for round_no in range(1, max_rounds + 1):
        # grep exits with 0 only when the reported state contains "active"
        this_code, _ = shell.call(this_nn_check)
        if this_code == 0:
            return True

        other_code, _ = shell.call(other_nn_check)
        if other_code == 0:
            return False

        # Do not sleep after last iteration
        if round_no < max_rounds:
            time.sleep(pause_seconds)

    Logger.info("Active NameNode is not found.")
    return False
def get_write_lock_files_solr_cloud(hadoop_prefix, collections):
    """
    Collect write-lock entries for the cores whose ZooKeeper state matches
    HOSTNAME_VERIFIER_PATTERN.

    For every collection path the cores are listed, the collection's
    state.json is fetched from ZooKeeper, and each core whose state matches
    the verifier pattern contributes one WRITE_LOCK_PATTERN entry.
    Collections whose ZooKeeper lookup fails are logged and skipped.

    :param hadoop_prefix: command prefix used for the "-ls" listing
    :param collections: iterable of collection paths
    :return: all matching write-lock entries concatenated into one string
    """
    import params

    lock_entries = []
    regex_flags = re.MULTILINE | re.DOTALL

    for collection_path in collections:
        # NOTE(review): the listing's return code is not checked.
        code, output = call(format('{hadoop_prefix} -ls {collection_path}'))
        core_paths = get_core_paths(output, collection_path)

        collection_name = collection_path.replace(format('{solr_hdfs_directory}/'), '')
        state_command = format(
            '{zk_client_prefix} -cmd get {solr_cloud_zk_directory}/collections/{collection_name}/state.json')
        zk_code, zk_output = call(
            state_command,
            env={'JAVA_HOME': params.java64_home},
            timeout=60
        )
        if zk_code != 0:
            Logger.error(format('Cannot determine cores owned by [{solr_hostname}] in collection [{collection_name}] due to ZK error.'))
            continue

        for core_path in core_paths:
            # NOTE(review): core_node_name looks unused, but it is presumably
            # consumed by format(HOSTNAME_VERIFIER_PATTERN) below - keep it.
            core_node_name = core_path.replace(format('{collection_path}/'), '')
            hostname_matcher = re.compile(format(HOSTNAME_VERIFIER_PATTERN), regex_flags)
            if hostname_matcher.search(zk_output) is not None:
                lock_entries.append(WRITE_LOCK_PATTERN.format(core_path))

    return ''.join(lock_entries)
def post_upgrade_restart(self, env, upgrade_type=None):
    """
    Verify after a stack-upgrade restart that ZooKeeper still accepts writes.

    Nothing is done for "nonrolling" upgrades. Otherwise a uniquely named
    znode is created through the ZooKeeper CLI on a randomly chosen server,
    the root listing is checked for it, and the znode is deleted again.
    When a client port is configured, the "Mode" line of the local "stat"
    output is logged if it could be obtained.

    :param env: environment object; receives the params module
    :param upgrade_type: upgrade type; "nonrolling" skips the check
    """
    if upgrade_type == "nonrolling":
        return

    Logger.info("Executing Stack Upgrade post-restart")
    import params
    env.set_params(params)
    zk_server_host = random.choice(params.zookeeper_hosts)
    # NOTE(review): format() appears to read zk_server_host, unique and
    # cli_shell from the local scope - keep these local names.
    cli_shell = format("{zk_cli_shell} -server {zk_server_host}:{client_port}")

    # Ensure that a quorum is still formed.
    unique = get_unique_id_and_date()
    create_command = format("echo 'create /{unique} mydata' | {cli_shell}")
    list_command = format("echo 'ls /' | {cli_shell}")
    delete_command = format("echo 'delete /{unique} ' | {cli_shell}")

    quorum_err_message = "Failed to establish zookeeper quorum"
    call_and_match_output(create_command, 'Created', quorum_err_message, user=params.zk_user)
    # Both regex fragments are raw strings so that "\]" is not treated as an
    # (invalid) string escape sequence.
    call_and_match_output(list_command, r"\[.*?" + unique + r".*?\]", quorum_err_message, user=params.zk_user)
    # Best-effort cleanup: the result of the delete is not checked.
    shell.call(delete_command, user=params.zk_user)

    if params.client_port:
        check_leader_command = format("echo stat | nc localhost {client_port} | grep Mode")
        code, out = shell.call(check_leader_command, logoutput=False)
        if code == 0 and out:
            Logger.info(out)
def actionexecute(self, env):
    """
    Point all HDP components at the requested stack version.

    Reads the target version from '/commandParams/version', cleans the yum
    cache on RedHat-family systems, runs "hdp-select set all <version>" for
    HDP stacks at version 2.2 or later, and for 2.3 or later links every
    configured conf directory to its current directory.

    :param env: execution environment (not used in this block)
    :raises Fail: if no version was supplied
    """
    # The returned configuration is not used here; the call itself is kept.
    Script.get_config()

    version = default('/commandParams/version', None)
    stack_name = default('/hostLevelParams/stack_name', "")

    if not version:
        raise Fail("Value is required for '/commandParams/version'")

    # other os?
    if OSCheck.is_redhat_family():
        shell.call(('/usr/bin/yum', 'clean', 'all'), sudo=True)

    min_ver = format_hdp_stack_version("2.2")
    real_ver = format_hdp_stack_version(version)

    if stack_name != "HDP":
        return

    if compare_versions(real_ver, min_ver) >= 0:
        shell.call(('hdp-select', 'set', 'all', version), sudo=True)

    if compare_versions(real_ver, format_hdp_stack_version("2.3")) >= 0:
        # backup the old and symlink /etc/[component]/conf to /usr/hdp/current/[component]
        for dir_defs in conf_select.PACKAGE_DIRS.values():
            for dir_def in dir_defs:
                link_config(dir_def['conf_dir'], dir_def['current_dir'])
def prepare_rpcbind():
    """
    Prepare the host for the NFS gateway.

    Stops the native NFS server if "pgrep nfsd" finds it, then makes sure
    that rpcbind or portmap is running, starting them when neither is found.

    :raises Fail: if the native NFS server is still running after the stop
                  commands, or if neither rpcbind nor portmap could be started
    """
    Logger.info("check if native nfs server is running")
    nfsd_code, _ = shell.call("pgrep nfsd")
    if nfsd_code == 0:
        Logger.info("native nfs server is running. shutting it down...")
        # shutdown nfs (both service names are tried)
        shell.call("service nfs stop")
        shell.call("service nfs-kernel-server stop")
        Logger.info("check if the native nfs server is down...")
        nfsd_code, _ = shell.call("pgrep nfsd")
        if nfsd_code == 0:
            raise Fail("Failed to shutdown native nfs service")

    Logger.info("check if rpcbind or portmap is running")
    rpcbind_code, _ = shell.call("pgrep rpcbind")
    portmap_code, _ = shell.call("pgrep portmap")

    if rpcbind_code != 0 and portmap_code != 0:
        Logger.info("no portmap or rpcbind running. starting one...")
        rpcbind_code, _ = shell.call(("service", "rpcbind", "start"), sudo=True)
        portmap_code, _ = shell.call(("service", "portmap", "start"), sudo=True)
        # One successful start is enough.
        if rpcbind_code != 0 and portmap_code != 0:
            raise Fail("Failed to start rpcbind or portmap")

    Logger.info("now we are ready to start nfs gateway")
def _check_existence(self, name):
    """
    Tell whether the package (or every package matching a pattern) is installed.

    :param name: package name; may contain the wildcards '*' or '?'
    :return: False when CHECK_CMD exits non-zero; for wildcard names, True
             only if the GET_NOT_INSTALLED_CMD output contains a line equal
             to NO_PACKAGES_FOUND_STATUS; True otherwise
    """
    exit_code, _ = shell.call(CHECK_CMD % name)
    if exit_code:
        return False

    if '*' not in name and '?' not in name:
        return True

    # Check if all packages matching pattern are installed
    _, not_installed_output = shell.call(GET_NOT_INSTALLED_CMD % name)
    return NO_PACKAGES_FOUND_STATUS in not_installed_output.splitlines()
def initiate_safe_zkfc_failover():
    """
    Fail over from this NameNode if it is currently the active one.

    The service state of this NameNode is queried first. If it is active, a
    failover to the other NameNode is requested. When that command fails,
    ZKFC on this host is killed instead to force a failover; in that case
    ZKFC has to be started manually during the restart. Finally the function
    waits for this NameNode to report the standby state, unless the standby
    check indicates that the NameNode is already down or ZKFC was not killed.

    :raises Fail: if the service state of this NameNode cannot be determined
    """
    import params

    # Must kinit before running the HDFS command
    if params.security_enabled:
        Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
                user = params.hdfs_user)

    check_service_cmd = format("hdfs haadmin -getServiceState {namenode_id}")
    code, out = shell.call(check_service_cmd, logoutput=True, user=params.hdfs_user)

    if code != 0 or not out:
        raise Fail("Unable to determine NameNode HA states by calling command: {0}".format(check_service_cmd))

    if "active" in out:
        original_state = "active"
    elif "standby" in out:
        original_state = "standby"
    else:
        original_state = "unknown"
    Logger.info("Namenode service state: %s" % original_state)

    if original_state != "active":
        return

    msg = "Rolling Upgrade - Initiating a ZKFC failover on {0} NameNode host {1}.".format(original_state, params.hostname)
    Logger.info(msg)

    check_standby_cmd = format("hdfs haadmin -getServiceState {namenode_id} | grep standby")
    failover_command = format("hdfs haadmin -failover {namenode_id} {other_namenode_id}")

    # NOTE(review): `code` must stay a local with this name; the log lines
    # below reference it through format("... {code}").
    code, out = shell.call(failover_command, user=params.hdfs_user, logoutput=True)
    Logger.info(format("Rolling Upgrade - failover command returned {code}"))

    wait_for_standby = (code == 0)
    if not wait_for_standby:
        # Try to kill ZKFC manually
        was_zkfc_killed = kill_zkfc(params.hdfs_user)
        code, out = shell.call(check_standby_cmd, user=params.hdfs_user, logoutput=True)
        Logger.info(format("Rolling Upgrade - check for standby returned {code}"))

        if code == 255 and out:
            Logger.info("Rolling Upgrade - namenode is already down.")
        elif was_zkfc_killed:
            # Only mandate that this be the standby namenode if ZKFC was indeed killed to initiate a failover.
            wait_for_standby = True

    if wait_for_standby:
        Logger.info("Waiting for this NameNode to become the standby one.")
        Execute(check_standby_cmd,
                user=params.hdfs_user,
                tries=50,
                try_sleep=6,
                logoutput=True)
def _check_existence(self, name):
    """
    Tell whether the package (or a package matching a pattern) is installed.

    :param name: package name; may carry a dotted suffix and may contain the
                 wildcards '*' or '?'
    :return: False when CHECK_CMD exits non-zero; for wildcard names, True
             only if CHECK_AVAILABLE_PACKAGES_CMD exits with 0; True otherwise
    """
    if "." in name:
        # To work with names like 'zookeeper_2_2_1_0_2072.noarch'
        name = os.path.splitext(name)[0]

    exit_code, _ = shell.call(CHECK_CMD % name)
    if exit_code:
        return False

    if "*" not in name and "?" not in name:
        return True

    # Check if all packages matching pattern are installed
    available_code, _ = shell.call(CHECK_AVAILABLE_PACKAGES_CMD % name)
    return not available_code
def _init_cmd(self, command):
    """
    Run a service command through upstart or the SysV init script.

    For upstart services "/sbin/<command> <service>" is executed; for the
    "status" command the return code is derived from the output instead:
    1 when the state part equals "stop/waiting", 0 otherwise. Without
    upstart, "/etc/init.d/<service> <command>" is executed.

    :param command: command name, e.g. "status"
    :return: tuple (return code, command output)
    """
    if self._upstart:
        ret, out = shell.call(["/sbin/" + command, self.resource.service_name])
        if command == "status":
            # Output is split once on the first space: "<job> <state...>"
            _proc, state = out.strip().split(' ', 1)
            ret = 1 if state == "stop/waiting" else 0
    else:
        init_script = "/etc/init.d/%s" % self.resource.service_name
        ret, out = shell.call([init_script, command])
    return ret, out
def action_run(self):
    """
    Write the resource's code to a temporary file and run it with the
    resource's interpreter.

    The temporary file is handed to _ensure_metadata together with the
    resource's user and group before the interpreter is invoked. The result
    of the interpreter call is not inspected.
    """
    from tempfile import NamedTemporaryFile

    Logger.info("Running script %s" % self.resource)
    with NamedTemporaryFile(prefix="resource_management-script", bufsize=0) as script_file:
        script_file.write(self.resource.code)
        script_file.flush()

        script_path = script_file.name
        _ensure_metadata(script_path, self.resource.user, self.resource.group)

        argv = [self.resource.interpreter, script_path]
        shell.call(argv,
                   cwd=self.resource.cwd,
                   env=self.resource.environment,
                   preexec_fn=_preexec_fn(self.resource))
def _check_existence(self, name):
    """
    Tell whether the package (or every package matching a pattern) is installed.

    :param name: package name; names containing '*' or '.' are treated as
                 patterns
    :return: False when CHECK_EXISTENCE_CMD exits non-zero; for patterns,
             False as soon as one matching package's status output lacks a
             line equal to PACKAGE_INSTALLED_STATUS; True otherwise
    """
    exit_code, _ = shell.call(CHECK_EXISTENCE_CMD % name)
    if exit_code:
        return False

    is_pattern = '*' in name or '.' in name
    if not is_pattern:
        return True

    # Check if all packages matching regexp are installed
    _, matching_packages = shell.call(GET_PACKAGES_BY_PATTERN_CMD % name)
    for package_name in matching_packages.splitlines():
        _, status_output = shell.call(GET_PACKAGE_STATUS_CMD % package_name)
        if PACKAGE_INSTALLED_STATUS not in status_output.splitlines():
            return False
    return True
def setup_solr_cloud():
    """
    Create the Solr cloud ZooKeeper directory unless the cluster state node
    is already present.

    The cluster state node is read first; unless the output contains
    "NoNodeException" the node is considered existing and nothing is done.
    Otherwise "makepath" is executed for the Solr cloud ZooKeeper directory,
    ignoring failures.
    """
    import params

    lookup_command = format('{zk_client_prefix} -cmd get {solr_cloud_zk_directory}{clusterstate_json}')
    # Only the output is inspected; the return code is not used.
    _, output = call(
        lookup_command,
        env={'JAVA_HOME': params.java64_home},
        timeout=60
    )

    if "NoNodeException" not in output:
        Logger.info(
            format("ZK node {solr_cloud_zk_directory}{clusterstate_json} already exists, skipping ...")
        )
        return

    makepath_command = format('{zk_client_prefix} -cmd makepath {solr_cloud_zk_directory}')
    Execute(
        makepath_command,
        environment={'JAVA_HOME': params.java64_home},
        ignore_failures=True,
        user=params.solr_config_user
    )
def kill_zkfc(zkfc_user):
    """
    Kill the ZKFC process on this host to trigger a NameNode failover.

    There are two potential methods for failing over the namenode, especially
    during a Rolling Upgrade.
    Option 1. Kill zkfc on primary namenode provided that the secondary is up
              and has zkfc running on it.
    Option 2. Silent failover (not supported as of HDP 2.2.0.0)

    ZKFC is only killed when HA is enabled, a pid file is known, and the
    process named in that pid file is running. The pid file is deleted
    afterwards.

    :param zkfc_user: User that started the ZKFC process.
    :return: Return True if ZKFC was killed, otherwise, false.
    """
    import params

    if not params.dfs_ha_enabled:
        return False

    # NOTE(review): zkfc_pid_file is referenced by the format() strings
    # below - keep this local name.
    zkfc_pid_file = get_service_pid_file("zkfc", zkfc_user)
    if not zkfc_pid_file:
        return False

    check_process = as_user(
        format("ls {zkfc_pid_file} > /dev/null 2>&1 && ps -p `cat {zkfc_pid_file}` > /dev/null 2>&1"),
        user=zkfc_user)
    exit_code, _ = shell.call(check_process)
    if exit_code != 0:
        return False

    Logger.debug("ZKFC is running and will be killed.")
    kill_command = format("kill -15 `cat {zkfc_pid_file}`")
    Execute(kill_command, user=zkfc_user)
    File(zkfc_pid_file, action="delete")
    return True
def create(stack_name, package, version, dry_run = False):
    """
    Creates a config version for the specified package.

    :param stack_name: the name of the stack
    :param package: the name of the package, as-used by conf-select
    :param version: the version number to create
    :param dry_run: True to run "dry-run-create" instead of "create-conf-dir"
    :return: list of directories reported by the command; empty when the
             parameters are not valid or the command failed
    """
    Logger.info("Checking if need to create versioned conf dir /etc/{0}/{1}/0".format(package, version))
    if not _valid(stack_name, package, version):
        Logger.info("Will not create it since parameters are not valid.")
        return []

    command = "dry-run-create" if dry_run else "create-conf-dir"

    code, stdout, stderr = shell.call(get_cmd(command, package, version),
                                      logoutput=False, quiet=False, sudo=True,
                                      stderr = subprocess.PIPE)

    # conf-select can set more than one directory
    # per package, so return that list, especially for dry_run
    dirs = []
    if code == 0 and stdout is not None:  # just be sure we have a stdout
        dirs = [line.rstrip('\n') for line in stdout.splitlines()]

    # take care of permissions
    really_created = not dry_run
    if not code and stdout and really_created:
        for conf_dir in dirs:
            Directory(conf_dir, mode=0o755, cd_access='a', recursive=True)

    return dirs
def pre_rolling_upgrade_shutdown(hdfs_binary):
    """
    Runs the "shutdownDatanode {ipc_address} upgrade" command to shutdown the
    DataNode in preparation for an upgrade, obtaining a Kerberos ticket first
    if security is enabled. When the command succeeds,
    _check_datanode_shutdown is called to verify the DataNode is down.

    :param hdfs_binary: name/path of the HDFS binary to use
    :return: False if the command failed with "Shutdown already in progress"
             in its output (the DataNode must then be stopped forcefully);
             True in every other case, including other command failures
    """
    import params

    Logger.info('DataNode executing "shutdownDatanode" command in preparation for upgrade...')
    if params.security_enabled:
        Execute(params.dn_kinit_cmd, user = params.hdfs_user)

    # NOTE(review): dfsadmin_base_command is referenced by the format()
    # string below - keep this local name.
    dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
    command = format('{dfsadmin_base_command} -shutdownDatanode {dfs_dn_ipc_address} upgrade')

    exit_code, output = shell.call(command, user=params.hdfs_user)
    if exit_code == 0:
        # verify that the datanode is down
        _check_datanode_shutdown(hdfs_binary)
        return True

    # Due to bug HDFS-7533, DataNode may not always shutdown during stack upgrade, and it is necessary to kill it.
    shutdown_stuck = output is not None and re.search("Shutdown already in progress", output)
    if shutdown_stuck:
        Logger.error("Due to a known issue in DataNode, the command {0} did not work, so will need to shutdown the datanode forcefully.".format(command))
        return False
    return True
def get_hdp_version():
    """
    Run "hdp-select status hadoop-client" with a 20 second timeout.

    NOTE(review): the command's return code and output are captured but not
    used or returned in this block - confirm whether parsing of the output
    is missing here.

    :raises Fail: if running the command raises any exception
    """
    command = 'hdp-select status hadoop-client'
    try:
        return_code, hdp_output = shell.call(command, timeout=20)
    except Exception as e:
        # Log the original reason before replacing it with a generic failure
        Logger.error(str(e))
        raise Fail('Unable to execute hdp-select command to retrieve the version.')
def status(self, env):
    """
    Check whether a process matching "proc_rangerkms" shows up in "ps -ef".

    :param env: execution environment (not used in this block)
    :raises ComponentIsNotRunning: if the process lookup exits non-zero
    """
    lookup_command = 'ps -ef | grep proc_rangerkms | grep -v grep'
    exit_code, _ = shell.call(lookup_command, timeout=20)

    if exit_code == 0:
        return

    Logger.debug('KMS process not running')
    raise ComponentIsNotRunning()
def remove_solr_ssl_support():
    """
    Clear the cluster properties node of the Solr cloud in ZooKeeper.

    Nothing is done when Solr does not run in cloud mode or when reading the
    node reports "NoNodeException". Failures of the clear command are
    ignored.
    """
    import params

    if not params.solr_cloud_mode:
        return

    lookup_command = format('{zk_client_prefix} -cmd get {solr_cloud_zk_directory}{clusterprops_json}')
    # Only the output is inspected; the return code is not used.
    _, output = call(
        lookup_command,
        env={'JAVA_HOME': params.java64_home},
        timeout=60
    )

    if "NoNodeException" in output:
        return

    clear_command = format('{zk_client_prefix} -cmd clear {solr_cloud_zk_directory}{clusterprops_json}')
    Execute(
        clear_command,
        environment={'JAVA_HOME': params.java64_home},
        ignore_failures=True,
        user=params.solr_config_user
    )
def _get_current_hiveserver_version():
    """
    Runs "hive --version" for the source version of the upgrade/downgrade.

    On a downgrade the source version is params.downgrade_from_version,
    otherwise params.current_version. For stack versions 2.2 and later the
    hive binary under /usr/hdp/<source_version>/hive/bin is used.

    NOTE(review): the command's return code and output are captured but not
    parsed or returned in this block - confirm whether that part is missing.

    :raises Fail: if anything in the procedure raises, including the missing
                  'downgrade_from_version' check (its message is only logged)
    """
    import params

    try:
        # When downgrading the source version should be the version we are downgrading from
        is_downgrade = params.upgrade_direction == "downgrade"
        if is_downgrade and not params.downgrade_from_version:
            raise Fail('The version from which we are downgrading from should be provided in \'downgrade_from_version\'')

        # NOTE(review): source_version and version_hive_bin are referenced by
        # the format() strings below - keep these local names.
        if is_downgrade:
            source_version = params.downgrade_from_version
        else:
            source_version = params.current_version

        hive_execute_path = _get_hive_execute_path(source_version)
        version_hive_bin = params.hive_bin
        formatted_source_version = format_hdp_stack_version(source_version)
        if formatted_source_version and compare_versions(formatted_source_version, "2.2") >= 0:
            version_hive_bin = format('/usr/hdp/{source_version}/hive/bin')

        command = format('{version_hive_bin}/hive --version')
        return_code, hdp_output = shell.call(command, user=params.hive_user, path=hive_execute_path)
    except Exception as e:
        Logger.error(str(e))
        raise Fail('Unable to execute hive --version command to retrieve the hiveserver2 version.')
def get_hdp_version():
    """
    Determine the HDP version to use.

    options.hdp_version is returned when it is set. Otherwise the version is
    parsed from the output of
    "/usr/bin/hdp-select status hadoop-mapreduce-historyserver".

    :return: the version string, or the integer 1 when the command failed or
             no version could be parsed from its output (callers rely on
             this sentinel, so it is kept)
    """
    if options.hdp_version:
        return options.hdp_version

    # Ubuntu returns: "stdin: is not a tty", as subprocess output, so the
    # command output is redirected into a temporary file and read from there.
    # The context manager guarantees the temporary file is closed and removed
    # on every exit path.
    with tempfile.NamedTemporaryFile() as tmpfile:
        get_hdp_version_cmd = '/usr/bin/hdp-select status %s > %s' % ('hadoop-mapreduce-historyserver', tmpfile.name)
        code, _ = shell.call(get_hdp_version_cmd)
        with open(tmpfile.name, 'r') as output_file:
            out = output_file.read()

    if code != 0:
        Logger.warning("Could not verify HDP version by calling '%s'. Return Code: %s, Output: %s." %
                       (get_hdp_version_cmd, str(code), str(out)))
        return 1

    # Versions look like "<dotted digits>-<build number>"
    matches = re.findall(r"([\d\.]+\-\d+)", out)
    hdp_version = matches[0] if matches else None

    if not hdp_version:
        Logger.error("Could not parse HDP version from output of hdp-select: %s" % str(out))
        return 1

    return hdp_version
def _check_datanode_startup(hdfs_binary):
    """
    Checks that a DataNode is reported as being alive via the
    "hdfs dfsadmin -fs {namenode_address} -report -live" command. The
    DataNode counts as alive when its hostname, or failing that its resolved
    IP address, appears in the report (compared case-insensitively).

    :param hdfs_binary: name/path of the HDFS binary to use
    :raises Fail: if the report cannot be obtained, the command exits
                  non-zero, or the DataNode is not listed
    """
    import params
    import socket

    try:
        dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
        command = dfsadmin_base_command + ' -report -live'
        return_code, hdfs_output = shell.call(command, user=params.hdfs_user)
    except Exception:
        # Not a bare except: SystemExit/KeyboardInterrupt must propagate.
        raise Fail('Unable to determine if the DataNode has started after upgrade.')

    if return_code != 0:
        raise Fail("Unable to determine if the DataNode has started after upgrade (result code {0})".format(str(return_code)))

    hostname = params.hostname.lower()
    live_report = hdfs_output.lower()

    found = hostname in live_report
    if not found:
        # Resolve the IP only when the hostname is not listed, and treat a
        # resolution failure as "not found" instead of crashing.
        try:
            found = socket.gethostbyname(hostname) in live_report
        except socket.error:
            found = False

    if not found:
        raise Fail("DataNode {0} was not found in the list of live DataNodes".format(params.hostname))

    Logger.info("DataNode {0} reports that it has rejoined the cluster.".format(params.hostname))
def bootstrap_standby_namenode(params, use_path=False):
    """
    Bootstrap the standby NameNode, retrying up to 50 times.

    Exit code 0 means the bootstrap succeeded and exit code 5 means the
    standby was already bootstrapped; any other code is logged and retried.

    :param params: object providing hadoop_bin_dir, command_phase and
                   hdfs_user
    :param use_path: True to prefix the hdfs command with hadoop_bin_dir
    :return: True if the standby is bootstrapped; False if all attempts
             failed or an exception was raised
    """
    # NOTE(review): bin_path is referenced by the format() strings below -
    # keep this local name.
    if use_path:
        bin_path = os.path.join(params.hadoop_bin_dir, '')
    else:
        bin_path = ""

    try:
        iterations = 50
        bootstrap_cmd = format("{bin_path}hdfs namenode -bootstrapStandby -nonInteractive")
        # Blue print based deployments start both NN in parallel and occasionally
        # the first attempt to bootstrap may fail. Depending on how it fails the
        # second attempt may not succeed (e.g. it may find the folder and decide that
        # bootstrap succeeded). The solution is to call with -force option but only
        # during initial start
        if params.command_phase == "INITIAL_START":
            bootstrap_cmd = format("{bin_path}hdfs namenode -bootstrapStandby -nonInteractive -force")
        Logger.info("Boostrapping standby namenode: %s" % (bootstrap_cmd))

        success_messages = {
            0: "Standby namenode bootstrapped successfully",
            5: "Standby namenode already bootstrapped",
        }
        for attempt in range(1, iterations + 1):
            Logger.info('Try %d out of %d' % (attempt, iterations))
            code, _ = shell.call(bootstrap_cmd, logoutput=False, user=params.hdfs_user)
            if code in success_messages:
                Logger.info(success_messages[code])
                return True
            Logger.warning('Bootstrap standby namenode failed with %d error code. Will retry' % (code))
    except Exception as ex:
        Logger.error('Bootstrap standby namenode threw an exception. Reason %s' %(str(ex)))

    return False
def _check_datanode_startup():
    """
    Checks that a DataNode is reported as being alive via the
    "hdfs dfsadmin -report -live" command. The DataNode counts as alive when
    its hostname appears in the report (compared case-insensitively).

    :raises Fail: if the report cannot be obtained, the command exits
                  non-zero, or the DataNode is not listed
    """
    import params

    try:
        # 'su - hdfs -c "hdfs dfsadmin -report -live"'
        command = 'hdfs dfsadmin -report -live'
        return_code, hdfs_output = shell.call(command, user=params.hdfs_user)
    except Exception:
        # Not a bare except: SystemExit/KeyboardInterrupt must propagate.
        raise Fail('Unable to determine if the DataNode has started after upgrade.')

    if return_code != 0:
        raise Fail("Unable to determine if the DataNode has started after upgrade (result code {0})".format(str(return_code)))

    if params.hostname.lower() not in hdfs_output.lower():
        raise Fail("DataNode {0} was not found in the list of live DataNodes".format(params.hostname))

    Logger.info("DataNode {0} reports that it has rejoined the cluster.".format(params.hostname))
def _check_nodemanager_startup():
    '''
    Checks that a NodeManager is in a RUNNING state in the cluster via
    "yarn node -list -states=RUNNING" command. The NodeManager counts as
    running when its hostname or its NodeManager address appears in the
    output (compared case-insensitively).

    :raises Fail: if the list cannot be obtained, the command exits non-zero,
                  or the NodeManager is not listed
    '''
    import params

    command = 'yarn node -list -states=RUNNING'

    try:
        # 'su - yarn -c "yarn node -status c6401.ambari.apache.org:45454"'
        # NOTE(review): the command runs as params.hdfs_user although the
        # comment above mentions the yarn user - confirm this is intended.
        return_code, yarn_output = shell.call(command, user=params.hdfs_user)
    except Exception:
        # Not a bare except: SystemExit/KeyboardInterrupt must propagate.
        raise Fail('Unable to determine if the NodeManager has started after upgrade.')

    if return_code != 0:
        raise Fail('Unable to determine if the NodeManager has started after upgrade (result code {0})'.format(str(return_code)))

    hostname = params.hostname.lower()
    nodemanager_address = params.nm_address.lower()
    yarn_output = yarn_output.lower()

    if hostname not in yarn_output and nodemanager_address not in yarn_output:
        raise Fail('NodeManager with ID {0} was not found in the list of running NodeManagers'.format(nodemanager_address))

    Logger.info('NodeManager with ID {0} has rejoined the cluster.'.format(nodemanager_address))
def check_process_status(pid_file):
    """
    Function checks whether process is running.
    Process is considered running, if pid file exists, and process with
    a pid, mentioned in pid file is running.
    If process is not running, will throw ComponentIsNotRunning exception.

    @param pid_file: path to service pid file
    """
    if not pid_file or not os.path.isfile(pid_file):
        raise ComponentIsNotRunning()

    try:
        pid = int(sudo.read_file(pid_file))
    except Exception:
        # The file exists at this point (checked above), so a failure here
        # means it could not be read or does not hold an integer pid.
        Logger.debug("Pid file {0} could not be read or does not contain a valid pid".format(pid_file))
        raise ComponentIsNotRunning()

    # "ps -p <pid>" exits non-zero when no such process exists
    code, out = shell.call(["ps", "-p", str(pid)])

    if code:
        Logger.debug("Process with pid {0} is not running. Stale pid file"
                     " at {1}".format(pid, pid_file))
        raise ComponentIsNotRunning()
def check_nifi_process_status(self, pid_file):
    """
    Function checks whether process is running.
    Process is considered running, if pid file exists, and process with
    a pid, mentioned in pid file is running.
    If process is not running, will throw ComponentIsNotRunning exception.

    The pid is taken from the third line of the pid file, from the part
    after the first '='.

    @param pid_file: path to service pid file
    """
    if not pid_file or not os.path.isfile(pid_file):
        raise ComponentIsNotRunning()

    try:
        # Close the pid file deterministically instead of leaking the handle.
        with open(pid_file) as pid_handle:
            lines = [line.rstrip('\n') for line in pid_handle]
        pid = int(lines[2].split('=')[1])
    except Exception:
        # The file exists at this point (checked above), so a failure here
        # means it could not be read or does not have the expected layout.
        Logger.warn("Pid file {0} could not be read or does not contain a valid pid".format(pid_file))
        raise ComponentIsNotRunning()

    # "ps -p <pid>" exits non-zero when no such process exists
    code, out = shell.call(["ps", "-p", str(pid)])

    if code:
        Logger.debug("Process with pid {0} is not running. Stale pid file"
                     " at {1}".format(pid, pid_file))
        raise ComponentIsNotRunning()
def main():
    """
    Register a new service with the management server through its REST API.

    Steps, in order: add the service, add its components, assign the
    components to hosts, create or update the service's configuration sites
    from the stack definition, apply the overrides from CONFIGS_TO_CHANGE,
    and finally request installation of all services in the INIT state.

    Intermediate results are exchanged through /tmp/config.json and
    /tmp/current_site.json. Every curl/configs.sh call that must succeed is
    run through checked_call.
    """
    # add service
    checked_call('curl -H \'X-Requested-By:anything\' -i -X POST -d \'[{{"ServiceInfo":{{"service_name":"{service_name}"}}}}]\' -u admin:admin {server_url}/api/v1/clusters/{cluster_name}/services'.
                 format(service_name=SERVICE_NAME, server_url=SERVER_URL, cluster_name=CLUSTER_NAME))

    # add components
    for component in COMPONENTS:
        checked_call('curl -H \'X-Requested-By:anything\' -i -X POST -d \'{{"components":[{{"ServiceComponentInfo":{{"component_name":"{component}"}}}}]}}\' -u admin:admin {server_url}/api/v1/clusters/{cluster_name}/services?ServiceInfo/service_name={service_name}'.
                     format(service_name=SERVICE_NAME, component=component, server_url=SERVER_URL, cluster_name=CLUSTER_NAME))

    # assign components to hosts
    for mapping in COMPONENTS_TO_HOSTS:
        for component, host in mapping.items():
            checked_call('curl -H \'X-Requested-By:anything\' -i -X POST -d \'{{"host_components":[{{"HostRoles":{{"component_name":"{component}"}}}}]}}\' -u admin:admin {server_url}/api/v1/clusters/{cluster_name}/hosts?Hosts/host_name={host}'.
                         format(host=host, component=component, server_url=SERVER_URL, cluster_name=CLUSTER_NAME))

    # update and create all the service-specific configurations
    # (a space is required between the quoted header and -X; without it the
    # shell glues "-X" onto the header value and curl treats "GET" as a URL)
    checked_call('curl -H \'X-Requested-By:anything\' -X GET -u admin:admin {server_url}/api/v1/stacks2/HDP/versions/{stack_version}/stackServices/{service_name}/configurations?fields=* > /tmp/config.json'.
                 format(server_url=SERVER_URL, stack_version=STACK_VERSION, service_name=SERVICE_NAME))
    with open('/tmp/config.json', "r") as f:
        stack_configs = json.load(f)

    # Group the stack's default properties by site; the last four characters
    # of the type are cut off to obtain the site name.
    configs = {}
    for item in stack_configs['items']:
        stack_config = item['StackConfigurations']
        site_name = stack_config['type'][:-4]
        configs.setdefault(site_name, {})[stack_config['property_name']] = stack_config['property_value']

    for site_name, site_content in configs.items():
        # A non-zero exit code is taken to mean the site does not exist yet.
        code = call('/var/lib/tbds-server/resources/scripts/configs.sh get {hostname} {cluster_name} {site_name}'.format(hostname=HOSTNAME, cluster_name=CLUSTER_NAME, site_name=site_name))[0]

        if code:
            print("Adding new site: "+site_name)
            checked_call('curl -i -H \'X-Requested-By:anything\' -X PUT -d \'{{"Clusters":{{"desired_configs":{{"type":"{site_name}","tag":"version1","properties":{site_content}}}}}}}\' -u admin:admin {server_url}/api/v1/clusters/{cluster_name}'.format(site_name=site_name, site_content=json.dumps(site_content), server_url=SERVER_URL, cluster_name=CLUSTER_NAME))
        else:
            timestamp = int(time.time())
            print("Modifiying site: "+site_name+" version"+str(timestamp))
            checked_call('/var/lib/tbds-server/resources/scripts/configs.sh get {hostname} {cluster_name} {site_name} /tmp/current_site.json'.format(hostname=HOSTNAME, cluster_name=CLUSTER_NAME, site_name=site_name))

            with open('/tmp/current_site.json', "r") as f:
                fcontent = f.read()
            # The file content is wrapped in braces before it is parsed.
            current_site = json.loads("{"+fcontent+"}")

            # Stack defaults overwrite the values currently stored for the site.
            for k, v in site_content.items():
                current_site['properties'][k] = v

            checked_call('curl -i -H \'X-Requested-By:anything\' -X PUT -d \'{{"Clusters":{{"desired_configs":{{"type":"{site_name}","tag":"version{timestamp}","properties":{site_content}}}}}}}\' -u admin:admin {server_url}/api/v1/clusters/{cluster_name}'.format(site_name=site_name, timestamp=timestamp, site_content=json.dumps(current_site['properties']), server_url=SERVER_URL, cluster_name=CLUSTER_NAME))

    for site_name, site_configs in CONFIGS_TO_CHANGE.items():
        for config_name, config_value in site_configs.items():
            print("Adding config "+config_name+"="+config_value+" to "+site_name)
            checked_call('/var/lib/tbds-server/resources/scripts/configs.sh set {hostname} {cluster_name} {site_name} {config_name} {config_value}'.format(config_name=config_name, config_value=config_value, hostname=HOSTNAME, cluster_name=CLUSTER_NAME, site_name=site_name))

    # install all new components
    checked_call('curl -H \'X-Requested-By:anything\' -i -X PUT -d \'{{"RequestInfo": {{"context" :"Installing Services"}}, "Body": {{"ServiceInfo": {{"state": "INSTALLED"}}}}}}\' -u admin:admin {server_url}/api/v1/clusters/{cluster_name}/services?ServiceInfo/state=INIT'.
                 format(server_url=SERVER_URL, cluster_name=CLUSTER_NAME))
def check_fs_root():
    """
    Make the Hive metastore's filesystem root match the configured one.

    The current root is read with "metatool -listFSRoot" (only lines
    containing "hdfs://"). If the lookup succeeds and the result differs
    from the configured root, "metatool -updateLocation" is executed.
    """
    import params

    fs_root_url = format("{fs_root}{hive_apps_whs_dir}")
    list_cmd = "/usr/lib/hive/bin/metatool -listFSRoot 2>/dev/null | grep hdfs://"
    # NOTE(review): `out` is referenced by the format() string below - keep
    # this local name.
    code, out = call(list_cmd, user=params.hive_user)

    if code != 0 or fs_root_url.strip() == out.strip():
        return

    update_cmd = format("/usr/lib/hive/bin/metatool -updateLocation {fs_root}{hive_apps_whs_dir} {out}")
    Execute(update_cmd,
            user=params.hive_user)
def reach_safemode_state(user, safemode_state, in_ha, hdfs_binary):
    """
    Enter or leave safemode for the Namenode.

    The current state is read with "-safemode get". If it already equals the
    desired state nothing is changed; otherwise the matching safemode
    instruction is executed and the state is checked once more.

    :param user: user to perform action as
    :param safemode_state: Desired state of ON or OFF
    :param in_ha: bool indicating if Namenode High Availability is enabled
                  (not used in this block)
    :param hdfs_binary: name/path of the HDFS binary to use
    :return: Returns a tuple of (transition success, original state). If no
             change is needed, the indicator of success will be True
    """
    Logger.info("Prepare to transition into safemode state %s" % safemode_state)
    import params

    original_state = SafeMode.UNKNOWN

    dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
    safemode_base_command = dfsadmin_base_command + " -safemode "
    # NOTE(review): safemode_check_cmd and grep_pattern are referenced by the
    # format() strings below - keep these local names.
    safemode_check_cmd = safemode_base_command + " get"

    grep_pattern = format("Safe mode is {safemode_state}")
    safemode_check_with_grep = format("{safemode_check_cmd} | grep '{grep_pattern}'")

    code, out = shell.call(safemode_check_cmd, user=user, logoutput=True)
    Logger.info("Command: %s\nCode: %d." % (safemode_check_cmd, code))
    if code != 0 or out is None:
        return (False, original_state)

    Logger.info(out)
    re_pattern = r"Safe mode is (\S*)"
    Logger.info("Pattern to search: {0}".format(re_pattern))
    m = re.search(re_pattern, out, re.IGNORECASE)
    if not (m and len(m.groups()) >= 1):
        return (False, original_state)

    original_state = m.group(1).upper()
    if original_state == safemode_state:
        return (True, original_state)

    # Make a transition
    command = safemode_base_command + safemode_to_instruction[safemode_state]
    Execute(command,
            user=user,
            logoutput=True,
            path=[params.hadoop_bin_dir])

    code, out = shell.call(safemode_check_with_grep, user=user)
    Logger.info("Command: %s\nCode: %d. Out: %s" % (safemode_check_with_grep, code, out))
    if code == 0:
        return (True, original_state)
    return (False, original_state)
def status(self, env):
    """
    Check whether a process matching "proc_rangeradmin" shows up in "ps -ef".

    :param env: execution environment (not used in this block)
    :raises ComponentIsNotRunning: if the process lookup exits non-zero
    """
    lookup_command = "ps -ef | grep proc_rangeradmin | grep -v grep"
    exit_code, _ = shell.call(lookup_command, timeout=20)

    if exit_code == 0:
        return

    Logger.debug("Ranger admin process not running")
    raise ComponentIsNotRunning()
def status(self, env):
    """
    Check whether Ranger admin is running.

    When the stack supports pid files the check is delegated to
    check_process_status. Otherwise "ps -ef" is searched for
    "proc_rangeradmin"; a missing process is tolerated (only logged) while
    is_ru_rangeradmin_in_progress reports true for the upgrade marker file.

    :param env: environment object; receives the status_params module
    :raises ComponentIsNotRunning: if the process is not found and the
                                   upgrade marker check is false
    """
    import status_params

    env.set_params(status_params)

    if status_params.stack_supports_pid:
        check_process_status(status_params.ranger_admin_pid_file)
        return

    lookup_command = 'ps -ef | grep proc_rangeradmin | grep -v grep'
    exit_code, _ = shell.call(lookup_command, timeout=20)
    if exit_code == 0:
        return

    if self.is_ru_rangeradmin_in_progress(status_params.upgrade_marker_file):
        Logger.info('Ranger admin process not running - skipping as stack upgrade is in progress')
        return

    Logger.debug('Ranger admin process not running')
    raise ComponentIsNotRunning()
def _get_single_version_from_hdp_select():
    """
    Call "hdp-select versions" and read its output from a file.

    The output is redirected to "copy_tarball_out.txt" inside the script's
    tmp dir and read back from there. Any exception raised while running the
    command or reading the file is logged with its traceback and swallowed.

    NOTE(review): no return statement is visible in this block, so the
    output that was read is not returned - confirm whether the parsing and
    return of the single version are missing here.
    """
    # Ubuntu returns: "stdin: is not a tty", as subprocess output, so must use a temporary file to store the output.
    # (The previously created but never used NamedTemporaryFile is gone;
    # only the file inside the script tmp dir is needed.)
    tmp_dir = Script.get_tmp_dir()
    tmp_file = os.path.join(tmp_dir, "copy_tarball_out.txt")

    out = None
    get_hdp_versions_cmd = "/usr/bin/hdp-select versions > {0}".format(tmp_file)
    try:
        code, stdoutdata = shell.call(get_hdp_versions_cmd, logoutput=True)
        with open(tmp_file, 'r+') as output_file:
            out = output_file.read()
    except Exception as e:
        Logger.logger.exception("Could not parse output of {0}. Error: {1}".format(str(tmp_file), str(e)))
def _exec_cmd(self, command, expect=None): if command != "status": self.log.info("%s command '%s'" % (self.resource, command)) custom_cmd = getattr(self.resource, "%s_command" % command, None) if custom_cmd: self.log.debug("%s executing '%s'" % (self.resource, custom_cmd)) if hasattr(custom_cmd, "__call__"): if custom_cmd(): ret = 0 else: ret = 1 else: ret, out = shell.call(custom_cmd) else: ret = self._init_cmd(command) if expect is not None and expect != ret: raise Fail( "%r command %s for service %s failed with return code: %d. %s" % (self, command, self.resource.service_name, ret, out)) return ret
def create(stack_name, package, version, dry_run = False):
    """
    Creates a config version for the specified package.

    :param stack_name: the name of the stack
    :param package: the name of the package, as-used by <conf-selector-tool>
    :param version: the version number to create
    :param dry_run: False to create the versioned config directory, True to
                    only return what would be created
    :return: list of directories reported by the command; empty when the
             parameters are not valid or the command failed
    """
    Logger.info("Checking if need to create versioned conf dir /etc/{0}/{1}/0".format(package, version))
    if not _valid(stack_name, package, version):
        Logger.info("Will not create it since parameters are not valid.")
        return []

    command = "dry-run-create" if dry_run else "create-conf-dir"

    code, stdout, stderr = shell.call(_get_cmd(command, package, version),
                                      logoutput=False, quiet=False, sudo=True,
                                      stderr = subprocess.PIPE)

    # <conf-selector-tool> can set more than one directory
    # per package, so return that list, especially for dry_run
    # > <conf-selector-tool> dry-run-create --package hive-hcatalog --stack-version 2.4.0.0-169 0
    # /etc/hive-webhcat/2.4.0.0-169/0
    # /etc/hive-hcatalog/2.4.0.0-169/0
    created_directories = []
    if code == 0 and stdout is not None:  # just be sure we have a stdout
        created_directories = [line.rstrip('\n') for line in stdout.splitlines()]

    # if directories were created, then do some post-processing
    needs_post_processing = not code and stdout and not dry_run
    if needs_post_processing:
        # take care of permissions if directories were created
        for directory in created_directories:
            Directory(directory, mode=0o755, cd_access='a', create_parents=True)

        # seed the new directories with configurations from the old (current) directories
        _seed_new_configuration_directories(package, created_directories)

    return created_directories
def get_uid(user, return_existing=False):
    """
    Tries to get UID for username. It will try to find UID in custom properties in *cluster_env* and, if
    *return_existing=True*, it will try to return UID of existing *user*.

    :param user: username to get UID for
    :param return_existing: return UID for existing user
    :return: the configured ``<user>_uid`` value when one is set; None when *return_existing* is False or
             *user* is the smoke user; otherwise the integer parsed from the output of changeUid.sh
    """
    import params
    user_str = str(user) + "_uid"
    service_env = [
        serviceEnv for serviceEnv in params.config['configurations']
        if user_str in params.config['configurations'][serviceEnv]
    ]

    if service_env and params.config['configurations'][service_env[0]][user_str]:
        service_env_str = str(service_env[0])
        uid = params.config['configurations'][service_env_str][user_str]
        if len(service_env) > 1:
            Logger.warning("Multiple values found for %s, using %s" % (user_str, uid))
        return uid

    if not return_existing:
        # do not return UID for existing user, used in User resource call to let OS to choose UID for us
        return None

    # pick up existing UID or try to find available UID in /etc/passwd, see changeToSecureUid.sh for more info
    if user == params.smoke_user:
        return None
    # 0o555 is accepted by Python 2.6+ and Python 3 (the bare 0555 form is Python 2 only)
    File(format("{tmp_dir}/changeUid.sh"),
         content=StaticFile("changeToSecureUid.sh"),
         mode=0o555)
    code, newUid = shell.call(format("{tmp_dir}/changeUid.sh {user}"))
    return int(newUid)
def bootstrap_standby_namenode(params, use_path=False):
    """
    Run "hdfs namenode -bootstrapStandby" as the HDFS user, retrying up to 50 times.

    :param params: object providing hadoop_bin_dir, command_phase and hdfs_user
    :param use_path: True to prefix the command with params.hadoop_bin_dir
    :return: True when the command exits with 0 (bootstrapped) or 5 (already
             bootstrapped); False when every attempt failed or an exception was thrown
    """
    # NOTE: bin_path is referenced by name inside the format() strings below.
    bin_path = os.path.join(params.hadoop_bin_dir, '') if use_path else ""
    # exit codes that count as success, with the message logged for each
    success_messages = {
        0: "Standby namenode bootstrapped successfully",
        5: "Standby namenode already bootstrapped",
    }
    max_tries = 50

    try:
        # Blue print based deployments start both NN in parallel and occasionally
        # the first attempt to bootstrap may fail. Depending on how it fails the
        # second attempt may not succeed (e.g. it may find the folder and decide that
        # bootstrap succeeded). The solution is to call with -force option but only
        # during initial start
        if params.command_phase == "INITIAL_START":
            bootstrap_cmd = format("{bin_path}hdfs namenode -bootstrapStandby -nonInteractive -force")
        else:
            bootstrap_cmd = format("{bin_path}hdfs namenode -bootstrapStandby -nonInteractive")
        Logger.info("Boostrapping standby namenode: %s" % (bootstrap_cmd))

        for attempt in range(1, max_tries + 1):
            Logger.info('Try %d out of %d' % (attempt, max_tries))
            code, out = shell.call(bootstrap_cmd, logoutput=False, user=params.hdfs_user)
            if code in success_messages:
                Logger.info(success_messages[code])
                return True
            Logger.warning('Bootstrap standby namenode failed with %d error code. Will retry' % (code))
    except Exception as ex:
        Logger.error('Bootstrap standby namenode threw an exception. Reason %s' % (str(ex)))

    return False
def _check_nodemanager_startup():
    '''
    Checks that a NodeManager is in a RUNNING state in the cluster via
    "yarn node -list -states=RUNNING" command. Once the NodeManager is found to be
    alive this method will return, otherwise it will raise a Fail(...).
    NOTE(review): the automatic retry mentioned by the original documentation is not
    implemented in this function; presumably a caller or decorator provides it.

    :return: None when this host or its NodeManager address appears in the command output
    :raises Fail: if the command cannot be run, exits non-zero, or does not list this NodeManager
    '''
    import params

    command = 'yarn node -list -states=RUNNING'

    try:
        # 'su - yarn -c "yarn node -status c6401.ambari.apache.org:45454"'
        return_code, yarn_output = shell.call(command, user=params.hdfs_user)
    except Exception:
        # Exception (not a bare except) so SystemExit/KeyboardInterrupt still propagate
        raise Fail('Unable to determine if the NodeManager has started after upgrade.')

    if return_code != 0:
        raise Fail(
            'Unable to determine if the NodeManager has started after upgrade (result code {0})'
            .format(str(return_code)))

    hostname = params.hostname.lower()
    nodemanager_address = params.nm_address.lower()
    yarn_output = yarn_output.lower()

    if hostname in yarn_output or nodemanager_address in yarn_output:
        Logger.info('NodeManager with ID {0} has rejoined the cluster.'.format(nodemanager_address))
        return

    raise Fail(
        'NodeManager with ID {0} was not found in the list of running NodeManagers'
        .format(nodemanager_address))
def _check_datanode_startup(hdfs_binary):
    """
    Checks that a DataNode is reported as being alive via the
    "hdfs dfsadmin -fs {namenode_address} -report -live" command. Once the DataNode is
    found to be alive this method will return, otherwise it will raise a Fail(...).
    NOTE(review): the automatic retry mentioned by the original documentation is not
    implemented in this function; presumably a caller or decorator provides it.

    :param hdfs_binary: name/path of the HDFS binary to use
    :return: None when this host's name or resolved IP appears in the report
    :raises Fail: if the command cannot be run, exits non-zero, or does not list this DataNode
    """
    import params
    import socket

    try:
        dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
        command = dfsadmin_base_command + ' -report -live'
        return_code, hdfs_output = shell.call(command, user=params.hdfs_user)
    except Exception:
        # Exception (not a bare except) so SystemExit/KeyboardInterrupt still propagate
        raise Fail('Unable to determine if the DataNode has started after upgrade.')

    if return_code != 0:
        raise Fail(
            "Unable to determine if the DataNode has started after upgrade (result code {0})"
            .format(str(return_code)))

    hostname = params.hostname.lower()
    hostname_ip = socket.gethostbyname(params.hostname.lower())
    # lower-case the report once instead of once per comparison
    live_report = hdfs_output.lower()

    if hostname in live_report or hostname_ip in live_report:
        Logger.info("DataNode {0} reports that it has rejoined the cluster.".format(params.hostname))
        return

    raise Fail("DataNode {0} was not found in the list of live DataNodes".format(params.hostname))
def get_stack_version(package_name):
    """
    Runs "<stack selector> status <package_name>" through ambari-python-wrap.

    @param package_name, name of the package, from which, function will try to get stack version
    @return None when the stack selector tool does not exist yet.
            NOTE(review): the command's return code and output are not used in the
            portion of the function visible here.
    @raise Fail if the command cannot be executed
    """
    stack_selector_path = stack_tools.get_stack_tool_path(stack_tools.STACK_SELECTOR_NAME)

    if not os.path.exists(stack_selector_path):
        # interpolate the path; previously the '" + ... + "' text was logged literally
        Logger.info('Skipping get_stack_version since {0} is not yet available'.format(stack_selector_path))
        return None  # lazy fail

    try:
        command = 'ambari-python-wrap {stack_selector_path} status {package_name}'.format(
            stack_selector_path=stack_selector_path, package_name=package_name)
        return_code, stack_output = shell.call(command, timeout=20)
    except Exception as e:
        Logger.error(str(e))
        raise Fail('Unable to execute {0} command to retrieve the version.'.format(stack_selector_path))
def delete_write_lock_files():
    """
    Remove Solr write-lock files from HDFS.

    Lists the collections under the Solr HDFS directory, gathers the lock files to
    delete (cloud or standalone mode, depending on params.solr_cloud_mode) and, when
    that string is longer than one character, removes them with "dfs -rm -f" as the
    HDFS user. A kinit is prefixed to the hadoop command when security is enabled.
    """
    import params

    # NOTE: kinit_if_needed, hadoop_prefix and write_locks_to_delete are referenced
    # by name inside the format() strings below.
    kinit_if_needed = ''
    if params.security_enabled:
        kinit_if_needed = format('{kinit_path_local} {hdfs_principal_name} -kt {hdfs_user_keytab}; ')

    hadoop_prefix = format('{kinit_if_needed}hadoop --config {hadoop_conf_dir} dfs')
    _, listing = call(format('{hadoop_prefix} -ls {solr_hdfs_directory}'))
    collections = get_collection_paths(listing)

    write_locks_to_delete = (
        get_write_lock_files_solr_cloud(hadoop_prefix, collections)
        if params.solr_cloud_mode
        else get_write_lock_files_solr_standalone(collections)
    )

    if len(write_locks_to_delete) <= 1:
        return

    Execute(format('{hadoop_prefix} -rm -f {write_locks_to_delete}'),
            user=params.hdfs_user
    )
def check_fs_root(conf_dir, execution_path):
    """
    Make the Hive metastore FS root agree with params.fs_root.

    Lists the metastore FS roots with the Hive metatool, picks the first hdfs:// root
    that does not match params.fs_root and, if one is found, rewrites it with
    "metatool -updateLocation". Nothing is done when cluster-env/manage_hive_fsroot is
    disabled or params.fs_root does not start with hdfs://.

    :param conf_dir: Hive configuration directory passed to "hive --config"
    :param execution_path: PATH value used when running the metatool commands
    """
    import params

    if not params.manage_hive_fsroot:
        Logger.info("Skipping fs root check as cluster-env/manage_hive_fsroot is disabled")
        return

    if not params.fs_root.startswith("hdfs://"):
        Logger.info("Skipping fs root check as fs_root does not start with hdfs://")
        return

    metatool_cmd = format("hive --config {conf_dir} --service metatool")
    # env belongs to as_user(), not format(): passed to format() it was silently unused,
    # so the PATH override never reached the listFSRoot command (the update command
    # below already receives it through Execute's environment).
    cmd = as_user(format("{metatool_cmd} -listFSRoot"), params.hive_user, env={'PATH': execution_path}) \
          + format(" 2>/dev/null | grep hdfs:// | cut -f1,2,3 -d '/' | grep -v '{fs_root}' | head -1")
    code, out = shell.call(cmd)

    if code == 0 and out.strip() != "" and params.fs_root.strip() != out.strip():
        # out is referenced by name in the format() string below
        out = out.strip()
        cmd = format("{metatool_cmd} -updateLocation {fs_root} {out}")
        Execute(cmd,
                user=params.hive_user,
                environment={'PATH': execution_path}
        )
def _get_single_version_from_stack_select():
    """
    Call "<stack-selector> versions" and return the version string if only one version is available.
    NOTE(review): the portion visible here only captures the command output into ``out``;
    it does not parse it or return a version.

    :return: Returns a version string if successful, and None otherwise.
    """
    # Ubuntu returns: "stdin: is not a tty", as subprocess32 output, so must use a temporary file to store the output.
    tmp_dir = Script.get_tmp_dir()
    tmp_file = os.path.join(tmp_dir, "copy_tarball_out.txt")
    stack_version = None

    out = None
    stack_selector_path = stack_tools.get_stack_tool_path(stack_tools.STACK_SELECTOR_NAME)
    get_stack_versions_cmd = "{0} versions > {1}".format(stack_selector_path, tmp_file)
    try:
        code, stdoutdata = shell.call(get_stack_versions_cmd, logoutput=True)
        # named tmp_out so the handle does not shadow the builtin "file"
        with open(tmp_file, 'r+') as tmp_out:
            out = tmp_out.read()
    except Exception as e:
        Logger.logger.exception("Could not parse output of {0}. Error: {1}".format(str(tmp_file), str(e)))
def check_folder_until_size_not_changes(dir):
    """
    Call du -d 0 <folder> | cut -f 1 on specific directory until the size not changes
    (so copy operation has finished)

    The size is polled every 5 seconds and the function returns once two consecutive
    readings are equal. When the command produces no output the poll is simply
    repeated after the same pause.

    :param dir: directory whose size is checked (referenced by name in the format() strings)
    """
    cmd = format("du -d 0 {dir} | cut -f 1")
    size_str = "-1"
    while True:
        returncode, stdout = call(cmd, user=params.infra_solr_user, timeout=300)
        if stdout:
            # actual_size_str is referenced by name in the format() string below
            actual_size_str = stdout.strip()
            if actual_size_str == size_str:
                break
            Logger.info(format(
                "Actual size of '{dir}' is {actual_size_str}, wait 5 sec and check again, to make sure no copy operation is in progress..."))
            size_str = actual_size_str
        # Pause on every retry, including when du gave no output, so a failing
        # command cannot turn this into a busy loop.
        time.sleep(5)
def _chk_writable_mount(self, mount_point):
    """
    Tell whether the given mount point is writable.

    When running with effective uid 0 the answer comes from os.access(); otherwise
    "test -w <mount_point>" is run via sudo with half of
    Hardware.CHECK_REMOTE_MOUNTS_TIMEOUT_DEFAULT as timeout.

    :param mount_point: path of the mount point to check
    :return: truthy if writable; False if the check timed out or failed
    """
    if os.geteuid() == 0:
        return os.access(mount_point, os.W_OK)

    half_timeout = int(Hardware.CHECK_REMOTE_MOUNTS_TIMEOUT_DEFAULT) / 2
    # only show the command output when debug logging is on
    suppress_output = not logger.isEnabledFor(logging.DEBUG)
    try:
        # test if mount point is writable for current user
        result = call(['test', '-w', mount_point],
                      sudo=True,
                      timeout=half_timeout,
                      quiet=suppress_output)
    except (ExecuteTimeoutException, Fail):
        logger.exception("Exception happened while checking mount {0}".format(mount_point))
        return False
    return result and result[0] == 0
def is_directory_exists_in_HDFS(self, path, as_user):
    """
    Check with "hdfs dfs -test -d" whether ``path`` is a directory in HDFS.

    The command is prefixed with a kinit and followed by "echo $?"; the last line of
    the captured output is taken as the exit status.

    :param path: HDFS path to test (referenced by name in the format() string)
    :param as_user: user to run the command as
    :return: True if the reported exit status is '0', False otherwise
    """
    # NOTE: kinit_path_local and kinit_if_needed are referenced by name inside the format() strings.
    kinit_path_local = get_kinit_path(default('/configurations/kerberos-env/executable_search_paths', None))
    kinit_if_needed = format("{kinit_path_local} -kt {zeppelin_kerberos_keytab} {zeppelin_kerberos_principal};")

    # -d: if the path is a directory, return 0.
    test_cmd = format("{kinit_if_needed} hdfs --config {hadoop_conf_dir} dfs -test -d {path};echo $?")
    call_output = shell.call(test_cmd, user=as_user)[1]

    # if there is no kerberos setup then the string will contain "-bash: kinit: command not found",
    # so only the text after the last newline is the exit status
    exit_status = call_output.rsplit("\n", 1)[-1]

    # '1' means it does not exists
    return exit_status == '0'
def check_and_copy_notebook_in_hdfs(self, params):
    """
    Create the Zeppelin notebook directory in HDFS and copy the local notebooks into
    it, if the directory does not exist yet.

    The directory is zeppelin-config/zeppelin.notebook.dir when that value is
    absolute, otherwise it is placed under /user/<zeppelin_user>/.

    :param params: object providing config, zeppelin_user, notebook_dir and HdfsResource
    """
    configured_dir = params.config['configurations']['zeppelin-config']['zeppelin.notebook.dir']
    # NOTE: notebook_directory, kinit_path_local and kinit_if_needed are referenced by
    # name inside the format() strings below.
    if configured_dir.startswith("/"):
        notebook_directory = configured_dir
    else:
        notebook_directory = "/user/" + format("{zeppelin_user}") + "/" + configured_dir

    kinit_path_local = get_kinit_path(default('/configurations/kerberos-env/executable_search_paths', None))
    kinit_if_needed = format("{kinit_path_local} -kt {zeppelin_kerberos_keytab} {zeppelin_kerberos_principal};")

    notebook_directory_exists = shell.call(format("{kinit_if_needed} hdfs --config {hadoop_conf_dir} dfs -test -e {notebook_directory};echo $?"),
                                           user=params.zeppelin_user)[1]

    # if there is no kerberos setup then the string will contain "-bash: kinit: command not found".
    # The status printed by "echo $?" is always the LAST line, so take that one rather
    # than the second line, which is wrong whenever more than one line of noise precedes it.
    output_lines = notebook_directory_exists.strip().splitlines()
    if output_lines:
        notebook_directory_exists = output_lines[-1]

    # '1' means it does not exists
    if notebook_directory_exists == '1':
        # hdfs dfs -mkdir {notebook_directory}
        params.HdfsResource(format("{notebook_directory}"),
                            type="directory",
                            action="create_on_execute",
                            owner=params.zeppelin_user,
                            recursive_chown=True,
                            recursive_chmod=True
                            )

        # hdfs dfs -put /usr/hdp/current/zeppelin-server/notebook/ {notebook_directory}
        params.HdfsResource(format("{notebook_directory}"),
                            type="directory",
                            action="create_on_execute",
                            source=params.notebook_dir,
                            owner=params.zeppelin_user,
                            recursive_chown=True,
                            recursive_chmod=True
                            )
def get_component_version(stack_name, component_name):
    """
    For any stack name, returns the version currently installed for a given component.
    Because each stack name may have different logic, the input is a generic dictionary.
    Only the "HDP" stack is handled here (via "/usr/bin/hdp-select status <component>").

    :param stack_name: one of HDP, HDPWIN, BIGTOP, PHD, etc. usually retrieved from
                       the command-#.json file's ["hostLevelParams"]["stack_name"]
    :param component_name: Component name as a string necessary to get the version
    :return: Returns a string if found, e.g., 2.2.1.0-2175, otherwise, returns None
    """
    version = None
    if stack_name is None or component_name is None:
        Logger.error("Could not determine component version because of the parameters is empty. " \
                     "stack_name: %s, component_name: %s" % (str(stack_name), str(component_name)))
        return version

    out = None
    code = -1
    if stack_name == "HDP":
        tmpfile = tempfile.NamedTemporaryFile()

        get_hdp_comp_version_cmd = ""
        try:
            # This is necessary because Ubuntu returns "stdin: is not a tty", see AMBARI-8088
            with open(tmpfile.name, 'r') as file:
                get_hdp_comp_version_cmd = '/usr/bin/hdp-select status %s > %s' % (component_name, tmpfile.name)
                code, stdoutdata = shell.call(get_hdp_comp_version_cmd)
                out = file.read()

            if code != 0 or out is None:
                raise Exception("Code is nonzero or output is empty")

            Logger.debug("Command: %s\nOutput: %s" % (get_hdp_comp_version_cmd, str(out)))
            matches = re.findall(r"([\d\.]+\-\d+)", out)
            version = matches[0] if matches else None
        except Exception as e:
            Logger.error("Could not determine HDP version for component %s by calling '%s'. Return Code: %s, Output: %s." %
                         (component_name, get_hdp_comp_version_cmd, str(code), str(out)))
        finally:
            # closing the NamedTemporaryFile also removes it from disk
            tmpfile.close()

    # hand the result back to the caller, as the documented contract promises
    return version
def get_check_command(oozie_url, host_name, parameters):
    """
    Build the Oozie status check command and, when security is enabled, the
    Kerberos environment to run it with.

    With security enabled a per-process credentials cache is selected and, if klist
    reports no valid ticket in it, a 5-minute ticket is obtained with kinit.

    :param oozie_url: Oozie URL passed to "oozie admin -oozie" (referenced by name in format())
    :param host_name: fqdn substituted for _HOST in the Oozie principal
    :param parameters: dict of alert parameters (security flag, keytab, principal)
    :return: tuple (command, kerberos_env); kerberos_env is None when security is disabled
    :raises KerberosPropertiesNotFound: if security is enabled but keytab or principal is missing
    """
    security_enabled = (SECURITY_ENABLED in parameters
                        and str(parameters[SECURITY_ENABLED]).upper() == 'TRUE')

    kerberos_env = None
    if security_enabled:
        if not (OOZIE_KEYTAB in parameters and OOZIE_PRINCIPAL in parameters):
            raise KerberosPropertiesNotFound('The Oozie keytab and principal are required parameters when security is enabled.')

        # NOTE: oozie_keytab, oozie_principal, ccache_file, klist_path_local and
        # kinit_path_local are referenced by name inside the format() strings below.
        oozie_keytab = parameters[OOZIE_KEYTAB]
        # substitute _HOST in kerberos principal with actual fqdn
        oozie_principal = parameters[OOZIE_PRINCIPAL].replace('_HOST', host_name)

        # Create the kerberos credentials cache (ccache) file and set it in the environment to use
        # when executing curl
        agent_env = Environment.get_instance()
        ccache_file = "{0}{1}oozie_alert_cc_{2}".format(agent_env.tmp_dir, os.sep, os.getpid())
        kerberos_env = {'KRB5CCNAME': ccache_file}

        # Determine if we need to kinit by testing to see if the relevant cache exists and has
        # non-expired tickets. Tickets are marked to expire after 5 minutes to help reduce the number
        # it kinits we do but recover quickly when keytabs are regenerated
        klist_path_local = get_klist_path()
        klist_status, _ = call(format("{klist_path_local} -s {ccache_file}"))
        if klist_status != 0:
            kinit_path_local = get_kinit_path()
            # kinit
            Execute(format("{kinit_path_local} -l 5m -kt {oozie_keytab} {oozie_principal}; "),
                    environment=kerberos_env)

    command = format("source /etc/oozie/conf/oozie-env.sh ; oozie admin -oozie {oozie_url} -status")
    return (command, kerberos_env)
def initialize_ha_zookeeper(params):
    """
    Run "hdfs zkfc -formatZK -nonInteractive" as the HDFS user, retrying up to 10 times.

    :param params: object providing hdfs_user
    :return: True when the HA state is (or already was) initialized in ZooKeeper,
             False when every attempt failed or an exception was thrown
    """
    max_tries = 10
    zk_format_cmd = "hdfs zkfc -formatZK -nonInteractive"
    try:
        Logger.info("Initialize HA state in ZooKeeper: %s" % (zk_format_cmd))
        attempt = 0
        while attempt < max_tries:
            attempt += 1
            Logger.info('Try %d out of %d' % (attempt, max_tries))
            code, out = shell.call(zk_format_cmd, logoutput=False, user=params.hdfs_user)

            if code == 0:
                Logger.info("HA state initialized in ZooKeeper successfully")
                return True
            if code == 2:
                Logger.info("HA state already initialized in ZooKeeper")
                return True
            # Precondition to starting zkfc is being formatted.
            # So zkfc being already started means format was already done.
            if code == 1 and "zkfc is running as process " in out:
                Logger.info("HA state already initialized in ZooKeeper, since '{0}'".format(out))
                return True

            Logger.warning('HA state initialization in ZooKeeper failed with %d error code. Will retry' % (code))
    except Exception as ex:
        Logger.error('HA state initialization in ZooKeeper threw an exception. Reason %s' %(str(ex)))
    return False
def kill_zkfc(zkfc_user):
    """
    There are two potential methods for failing over the namenode, especially during a Rolling Upgrade.
    Option 1. Kill zkfc on primary namenode provided that the secondary is up and has zkfc running on it.
    Option 2. Silent failover

    This function sends SIGTERM (kill -15) to the ZKFC process named by the pid file
    and deletes the pid file, but only when HA is enabled, a pid file is configured
    and the process is found to be running.

    :param zkfc_user: User that started the ZKFC process.
    :return: Return True if ZKFC was killed, otherwise, false.
    """
    import params

    if not params.dfs_ha_enabled or not params.zkfc_pid_file:
        return False

    pid_check_cmd = as_user(format("ls {zkfc_pid_file} > /dev/null 2>&1 && ps -p `cat {zkfc_pid_file}` > /dev/null 2>&1"), user=zkfc_user)
    exit_code, _ = shell.call(pid_check_cmd)
    if exit_code != 0:
        # pid file is missing or the process is not running
        return False

    Logger.debug("ZKFC is running and will be killed.")
    Execute(format("kill -15 `cat {zkfc_pid_file}`"),
            user=zkfc_user
    )
    File(params.zkfc_pid_file,
         action = "delete",
    )
    return True
def prepare_warfile():
    """
    Invokes the 'prepare-war' command in Oozie in order to create the WAR.
    The prepare-war command uses the input WAR from ${OOZIE_HOME}/oozie.war and
    outputs the prepared WAR to ${CATALINA_BASE}/webapps/oozie.war - because of this,
    both of these environment variables must point to the upgraded oozie-server path and
    not oozie-client since it was not yet updated.

    This method will also perform a kinit if necessary.

    :return: None
    :raises Fail: if the command exits non-zero or its output lacks the expected success text
    """
    import params

    # get the kerberos token if necessary to execute commands as oozie
    if params.security_enabled:
        # oozie_principal_with_host is referenced by name in the format() string below
        oozie_principal_with_host = params.oozie_principal.replace("_HOST", params.hostname)
        kinit_cmd = format("{kinit_path_local} -kt {oozie_keytab} {oozie_principal_with_host}")
        Execute(kinit_cmd, user=params.oozie_user, logoutput=True)

    # setup environment
    war_env = {
        "CATALINA_BASE" : "/usr/hdp/current/oozie-server/oozie-server",
        "OOZIE_HOME" : "/usr/hdp/current/oozie-server",
    }

    # prepare the oozie WAR
    prepare_cmd = format("{oozie_setup_sh} prepare-war {oozie_secure} -d {oozie_libext_dir}")
    return_code, oozie_output = shell.call(prepare_cmd, user=params.oozie_user,
                                           logoutput=False, quiet=False, env=war_env)

    # set it to "" in to prevent a possible iteration issue
    oozie_output = "" if oozie_output is None else oozie_output

    prepared = "New Oozie WAR file with added".lower() in oozie_output.lower()
    if return_code != 0 or not prepared:
        message = "Unexpected Oozie WAR preparation output {0}".format(oozie_output)
        Logger.error(message)
        raise Fail(message)
def create(stack_name, package, version, dry_run=False):
    """
    Creates a config version for the specified package

    :param stack_name: the name of the stack
    :param package: the name of the package, as-used by conf-select
    :param version: the version number to create
    :param dry_run: True to run "dry-run-create" (report only), False to run "create-conf-dir"
    :return List of directories created (empty if the parameters are not valid or the command failed)
    """
    Logger.info("Checking if need to create versioned conf dir /etc/{0}/{1}/0".format(package, version))
    if not _valid(stack_name, package, version):
        Logger.info("Will not create it since parameters are not valid.")
        return []

    command = "dry-run-create" if dry_run else "create-conf-dir"

    code, stdout, stderr = shell.call(get_cmd(command, package, version),
                                      logoutput=False, quiet=False, sudo=True,
                                      stderr=subprocess.PIPE)

    # conf-select can set more than one directory
    # per package, so return that list, especially for dry_run
    dirs = []
    if 0 == code and stdout is not None:  # just be sure we have a stdout
        # splitlines() already drops the line terminators
        dirs = stdout.splitlines()

    # take care of permissions
    if not code and stdout and command == "create-conf-dir":
        for d in dirs:
            # 0o755 is accepted by Python 2.6+ and Python 3 (the bare 0755 form is Python 2 only)
            Directory(d, mode=0o755, cd_access='a', create_parents=True)

    return dirs
def get_uid(user):
    """
    Get the UID to use for ``user``.

    A non-empty ``<user>_uid`` property found in any configuration type wins (the
    first match is used and a warning is logged when several exist). Otherwise
    changeUid.sh is installed into the tmp dir and run for the user.

    :param user: username to get UID for
    :return: the configured UID value; None for the smoke user; otherwise the integer
             parsed from the output of changeUid.sh
    """
    import params
    user_str = str(user) + "_uid"
    service_env = [
        serviceEnv for serviceEnv in params.config['configurations']
        if user_str in params.config['configurations'][serviceEnv]
    ]

    if service_env and params.config['configurations'][service_env[0]][user_str]:
        service_env_str = str(service_env[0])
        uid = params.config['configurations'][service_env_str][user_str]
        if len(service_env) > 1:
            Logger.warning("Multiple values found for %s, using %s" % (user_str, uid))
        return uid

    if user == params.smoke_user:
        return None
    # 0o555 is accepted by Python 2.6+ and Python 3 (the bare 0555 form is Python 2 only)
    File(format("{tmp_dir}/changeUid.sh"),
         content=StaticFile("changeToSecureUid.sh"),
         mode=0o555)
    code, newUid = shell.call(format("{tmp_dir}/changeUid.sh {user}"))
    return int(newUid)
def locales(self):
    """
    Return the output of "locale -a" as a list of lines.

    :return: list of strings, one per line of the stripped command output
    """
    _, listing = shell.call("locale -a")
    trimmed = listing.strip()
    return trimmed.split("\n")
def _check_existence(self, name):
    """
    Run CHECK_CMD for ``name`` and report whether it succeeded.

    :param name: value substituted into CHECK_CMD
    :return: True if the command's return code is falsy (i.e. 0), False otherwise
    """
    check_command = CHECK_CMD % name
    exit_code, _ = shell.call(check_command)
    return not exit_code
def machine(self):
    """
    Return the stripped output of "/bin/uname -m".

    :return: output of the command with surrounding whitespace removed
    """
    uname_cmd = ["/bin/uname", "-m"]
    _, uname_output = shell.call(uname_cmd)
    return uname_output.strip()
def initiate_safe_zkfc_failover():
    """
    If this is the active namenode, initiate a safe failover and wait for it to become the standby.

    If an error occurs, force a failover to happen by killing zkfc on this host. In this case, during the Restart,
    will also have to start ZKFC manually.

    The failover is only attempted when this NameNode is the active one and the other
    NameNode is in standby; otherwise a "Skipping ZKFC failover" message is logged.
    """
    import params

    # Must kinit before running the HDFS command
    if params.security_enabled:
        Execute(format(
            "{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
                user=params.hdfs_user)

    active_namenode_id = None
    standby_namenode_id = None
    # Each returned collection holds entries whose first element is the NameNode id.
    active_namenodes, standby_namenodes, unknown_namenodes = get_namenode_states(
        params.hdfs_site, params.security_enabled, params.hdfs_user)
    if active_namenodes:
        active_namenode_id = active_namenodes[0][0]
    if standby_namenodes:
        standby_namenode_id = standby_namenodes[0][0]

    if active_namenode_id:
        Logger.info(format("Active NameNode id: {active_namenode_id}"))
    if standby_namenode_id:
        Logger.info(format("Standby NameNode id: {standby_namenode_id}"))
    if unknown_namenodes:
        for unknown_namenode in unknown_namenodes:
            Logger.info("NameNode HA state for {0} is unknown".format(
                unknown_namenode[0]))

    if params.namenode_id == active_namenode_id and params.other_namenode_id == standby_namenode_id:
        # Failover if this NameNode is active and other NameNode is up and in standby
        # (i.e. ready to become active on failover)
        Logger.info(
            format(
                "NameNode {namenode_id} is active and NameNode {other_namenode_id} is in standby"
            ))

        failover_command = format(
            "hdfs haadmin -ns {dfs_ha_nameservices} -failover {namenode_id} {other_namenode_id}"
        )
        # Exits 0 only when this NameNode reports the "standby" state.
        check_standby_cmd = format(
            "hdfs haadmin -ns {dfs_ha_nameservices} -getServiceState {namenode_id} | grep standby"
        )

        msg = "Rolling Upgrade - Initiating a ZKFC failover on active NameNode host {0}.".format(
            params.hostname)
        Logger.info(msg)
        # code is referenced by name in the format() string on the next statement
        code, out = shell.call(failover_command, user=params.hdfs_user, logoutput=True)
        Logger.info(
            format("Rolling Upgrade - failover command returned {code}"))
        wait_for_standby = False

        if code == 0:
            wait_for_standby = True
        else:
            # Try to kill ZKFC manually
            was_zkfc_killed = kill_zkfc(params.hdfs_user)
            code, out = shell.call(check_standby_cmd, user=params.hdfs_user, logoutput=True)
            Logger.info(
                format("Rolling Upgrade - check for standby returned {code}"))
            # Return code 255 together with some output is treated as "NameNode is down".
            if code == 255 and out:
                Logger.info("Rolling Upgrade - NameNode is already down.")
            else:
                if was_zkfc_killed:
                    # Only mandate that this be the standby namenode if ZKFC was indeed killed to initiate a failover.
                    wait_for_standby = True

        if wait_for_standby:
            Logger.info("Waiting for this NameNode to become the standby one.")
            # Retries the standby check up to 50 times, 6 seconds apart.
            Execute(check_standby_cmd, user=params.hdfs_user, tries=50, try_sleep=6, logoutput=True)
    else:
        msg = "Rolling Upgrade - Skipping ZKFC failover on NameNode host {0}.".format(
            params.hostname)
        Logger.info(msg)
def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None, env=None):
    """
    Perform a lifecycle action on the NameNode.

    :param action: one of "configure", "start", "stop", "status" or "decommission"; required
    :param hdfs_binary: name/path of the HDFS binary; required for "start" and "stop"
                        (referenced by name in the format() strings of the "start" branch)
    :param do_format: on "start", whether format_namenode() is called first
    :param upgrade_type: None for normal operation, or "rolling" / "nonrolling" during a stack upgrade
    :param env: passed through to safe_zkfc_op() on a rolling-upgrade start with HA enabled
    :raises Fail: if a required parameter is missing, the standby NameNode cannot be
                  bootstrapped, or "desired_namenode_role" is missing on a nonrolling HA start
    """
    if action is None:
        raise Fail('"action" parameter is required for function namenode().')

    if action in ["start", "stop"] and hdfs_binary is None:
        raise Fail(
            '"hdfs_binary" parameter is required for function namenode().')

    if action == "configure":
        import params
        #we need this directory to be present before any action(HA manual steps for
        #additional namenode)
        create_name_dirs(params.dfs_name_dir)
    elif action == "start":
        Logger.info("Called service {0} with upgrade_type: {1}".format(
            action, str(upgrade_type)))
        setup_ranger_hdfs(upgrade_type=upgrade_type)
        import params
        if do_format:
            format_namenode()
            pass

        File(params.exclude_file_path,
             content=Template("exclude_hosts_list.j2"),
             owner=params.hdfs_user,
             group=params.user_group)

        if params.dfs_ha_enabled and \
            params.dfs_ha_namenode_standby is not None and \
            params.hostname == params.dfs_ha_namenode_standby:
            # if the current host is the standby NameNode in an HA deployment
            # run the bootstrap command, to start the NameNode in standby mode
            # this requires that the active NameNode is already up and running,
            # so this execute should be re-tried upon failure, up to a timeout
            success = bootstrap_standby_namenode(params)
            if not success:
                raise Fail("Could not bootstrap standby namenode")

        if upgrade_type == "rolling" and params.dfs_ha_enabled:
            # Most likely, ZKFC is up since RU will initiate the failover command. However, if that failed, it would have tried
            # to kill ZKFC manually, so we need to start it if not already running.
            safe_zkfc_op(action, env)

        # Extra options appended to the NameNode start command; they depend on the upgrade type.
        options = ""
        if upgrade_type == "rolling":
            options = "-rollingUpgrade started"
        elif upgrade_type == "nonrolling":
            is_previous_image_dir = is_previous_fs_image()
            Logger.info(
                format(
                    "Previous file system image dir present is {is_previous_image_dir}"
                ))

            if params.dfs_ha_enabled:
                if params.desired_namenode_role is None:
                    raise Fail(
                        "Did not receive parameter \"desired_namenode_role\" to indicate the role that this NameNode should have."
                    )

                if params.desired_namenode_role == "active":
                    # The "-upgrade" command can only be used exactly once. If used more than once during a retry, it will cause problems.
                    options = "" if is_previous_image_dir else "-upgrade"

                if params.desired_namenode_role == "standby":
                    options = "-bootstrapStandby -force"
            else:
                # Both Primary and Secondary NameNode can use the same command.
                options = "" if is_previous_image_dir else "-upgrade"
        Logger.info(format("Option for start command: {options}"))

        service(action="start",
                name="namenode",
                user=params.hdfs_user,
                options=options,
                create_pid_dir=True,
                create_log_dir=True)

        if params.security_enabled:
            Execute(format(
                "{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"
            ),
                    user=params.hdfs_user)

        # Shell check that exits 0 only when the NameNode reports "Safe mode is OFF".
        is_namenode_safe_mode_off = format(
            "{hdfs_binary} dfsadmin -fs {namenode_address} -safemode get | grep 'Safe mode is OFF'"
        )
        # Either a shell command that exits 0 when this NameNode is active, or False
        # when no such check applies (non-HA, or nonrolling upgrade below).
        if params.dfs_ha_enabled:
            is_active_namenode_cmd = as_user(format(
                "{hdfs_binary} --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"
            ),
                                             params.hdfs_user,
                                             env={
                                                 'PATH': params.hadoop_bin_dir
                                             })
        else:
            is_active_namenode_cmd = False

        # During NonRolling Upgrade, both NameNodes are initially down,
        # so no point in checking if this is the active or standby.
        if upgrade_type == "nonrolling":
            is_active_namenode_cmd = False

        # ___Scenario___________|_Expected safemode state__|_Wait for safemode OFF____|
        # no-HA                 | ON -> OFF                | Yes                      |
        # HA and active         | ON -> OFF                | Yes                      |
        # HA and standby        | no change                | no check                 |
        # RU with HA on active  | ON -> OFF                | Yes                      |
        # RU with HA on standby | ON -> OFF                | Yes                      |
        # EU with HA on active  | no change                | no check                 |
        # EU with HA on standby | no change                | no check                 |
        # EU non-HA             | no change                | no check                 |

        check_for_safemode_off = False
        msg = ""
        if params.dfs_ha_enabled:
            if upgrade_type is not None:
                check_for_safemode_off = True
                msg = "Must wait to leave safemode since High Availability is enabled during a Stack Upgrade"
            else:
                # During normal operations, the NameNode is expected to be up.
                code, out = shell.call(
                    is_active_namenode_cmd,
                    logoutput=True)  # If active NN, code will be 0
                if code == 0:  # active
                    check_for_safemode_off = True
                    msg = "Must wait to leave safemode since High Availability is enabled and this is the Active NameNode."
                else:
                    msg = "Will remain in the current safemode state."
        else:
            msg = "Must wait to leave safemode since High Availability is not enabled."
            check_for_safemode_off = True

        Logger.info(msg)

        # During a NonRolling (aka Express Upgrade), stay in safemode since the DataNodes are down.
        stay_in_safe_mode = False
        if upgrade_type == "nonrolling":
            stay_in_safe_mode = True

        if check_for_safemode_off:
            Logger.info("Stay in safe mode: {0}".format(stay_in_safe_mode))
            if not stay_in_safe_mode:
                Logger.info(
                    "Wait to leafe safemode since must transition from ON to OFF."
                )
                try:
                    # Wait up to 30 mins
                    Execute(is_namenode_safe_mode_off,
                            tries=180,
                            try_sleep=10,
                            user=params.hdfs_user,
                            logoutput=True)
                except Fail:
                    # Best effort: a NameNode still in safemode is logged, not treated as a start failure.
                    Logger.error(
                        "NameNode is still in safemode, please be careful with commands that need safemode OFF."
                    )

        # Always run this on non-HA, or active NameNode during HA.
        create_hdfs_directories(is_active_namenode_cmd)
    elif action == "stop":
        import params
        service(action="stop", name="namenode", user=params.hdfs_user)
    elif action == "status":
        import status_params
        check_process_status(status_params.namenode_pid_file)
    elif action == "decommission":
        decommission()
def curl_krb_request(tmp_dir, keytab, principal, url, cache_file_prefix,
                     krb_exec_search_paths, return_only_http_code, caller_label, user,
                     connection_timeout=CONNECTION_TIMEOUT_DEFAULT, ca_certs=None,
                     kinit_timer_ms=DEFAULT_KERBEROS_KINIT_TIMER_MS,
                     method='', body='', header=''):
    """
    Makes a curl request using the kerberos credentials stored in a calculated cache file.

    The cache file name combines the supplied cache_file_prefix, the user and an md5 hash of
    "principal|keytab". The klist command is used to determine whether the cache is expired and
    a kinit is performed if necessary. Additionally, an internal timer forces a kinit after a
    configurable amount of time. This is to prevent boundary issues where requests hit the edge
    of a ticket's lifetime.

    :param tmp_dir: the directory to use for storing the local kerberos cache for this request.
    :param keytab: the location of the keytab to use when performing a kinit
    :param principal: the principal to use when performing a kinit
    :param url: the URL to request
    :param cache_file_prefix: an identifier used to build the unique cache name for this request.
                              This ensures that multiple requests can use the same cache.
    :param krb_exec_search_paths: the search path to use for invoking kerberos binaries
    :param return_only_http_code: True to return only the HTTP code, False to return the response body
    :param caller_label: an identifier to give context into the caller of this module (used for logging)
    :param user: the user to invoke the curl command as
    :param connection_timeout: connection timeout for curl, converted to an integer
                               (defaults to CONNECTION_TIMEOUT_DEFAULT); the overall curl
                               --max-time is this value plus 2
    :param ca_certs: path to certificates; when not resolvable, curl is run with -k
    :param kinit_timer_ms: the time (in ms) before forcing a kinit even if the klist cache is
                           still valid.
    :param method: optional HTTP method passed to curl via -X (only used when
                   return_only_http_code is False)
    :param body: optional request body passed via -d (only used together with method)
    :param header: optional request header passed via -H (only used together with method)
    :return: a tuple (response, error_msg, elapsed) where response is the integer HTTP code or
             the response body ("" when curl printed nothing), error_msg is curl's stderr or
             None, and elapsed is the request duration as measured by time.time(), i.e. in
             seconds.
    """
    import uuid

    # Backward compatibility with old code and management packs, etc.
    # All new code needs to pass ca_certs explicitly.
    if ca_certs is None:
        try:
            from ambari_agent.AmbariConfig import AmbariConfig
            ca_certs = AmbariConfig.get_resolved_config().get_ca_cert_file_path()
        except Exception:
            # Deliberately best-effort: without a resolvable CA file we fall back to "curl -k".
            logger.debug("Unable to resolve the CA certificate path from AmbariConfig",
                         exc_info=True)

    # start off false
    is_kinit_required = False

    # Create the kerberos credentials cache (ccache) file and set it in the environment to use
    # when executing curl. Use the md5 hash of the combination of the principal and keytab file
    # to generate a (relatively) unique cache filename so that we can use it as needed. Scope
    # this file by user in order to prevent sharing of cache files by multiple users.
    ccache_file_name = _md5("{0}|{1}".format(principal, keytab)).hexdigest()

    curl_krb_cache_path = os.path.join(tmp_dir, "curl_krb_cache")
    if not os.path.exists(curl_krb_cache_path):
        os.makedirs(curl_krb_cache_path)
    # 0o1777 (sticky, world-writable) is the spelling valid on both Python 2.6+ and Python 3.
    os.chmod(curl_krb_cache_path, 0o1777)

    ccache_file_path = "{0}{1}{2}_{3}_cc_{4}".format(
        curl_krb_cache_path, os.sep, cache_file_prefix, user, ccache_file_name)
    kerberos_env = {'KRB5CCNAME': ccache_file_path}

    # concurrent kinit's can cause the following error:
    # Internal credentials cache error while storing credentials while getting initial credentials
    kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
    kinit_lock.acquire()
    try:
        # If there are no tickets in the cache or they are expired, perform a kinit, else use
        # what is in the cache
        if krb_exec_search_paths:
            klist_path_local = get_klist_path(krb_exec_search_paths)
        else:
            klist_path_local = get_klist_path()

        # take a look at the last time kinit was run for the specified cache and force a new
        # kinit if it's time; this helps to avoid problems approaching ticket boundary when
        # executing a klist and then a curl
        last_kinit_time = _KINIT_CACHE_TIMES.get(ccache_file_name, 0)
        current_time = int(time.time())

        # The recorded kinit times are in seconds while the timer is supplied in milliseconds,
        # so convert the timer before comparing.
        kinit_timer_seconds = kinit_timer_ms / 1000.0
        if current_time - kinit_timer_seconds > last_kinit_time:
            is_kinit_required = True

        # if the time has not expired, double-check that the cache still has a valid ticket
        if not is_kinit_required:
            klist_command = "{0} -s {1}".format(klist_path_local, ccache_file_path)
            is_kinit_required = (shell.call(klist_command, user=user)[0] != 0)

        # if kinit is required, then perform the kinit
        if is_kinit_required:
            if krb_exec_search_paths:
                kinit_path_local = get_kinit_path(krb_exec_search_paths)
            else:
                kinit_path_local = get_kinit_path()

            logger.debug("Enabling Kerberos authentication for %s via GSSAPI using ccache at %s",
                         caller_label, ccache_file_path)

            # kinit; there's no need to set a ticket timeout as this will use the default
            # invalidation configured in the krb5.conf - regenerating keytabs will not prevent
            # an existing cache from working correctly
            shell.checked_call("{0} -c {1} -kt {2} {3} > /dev/null".format(
                kinit_path_local, ccache_file_path, keytab, principal), user=user)

            # record kinit time (seconds since the epoch)
            _KINIT_CACHE_TIMES[ccache_file_name] = current_time
        else:
            # no kinit needed, use the cache
            logger.debug(
                "Kerberos authentication for %s via GSSAPI already enabled using ccache at %s.",
                caller_label, ccache_file_path)
    finally:
        kinit_lock.release()

    # check if cookies dir exists, if not then create it
    cookies_dir = os.path.join(tmp_dir, "cookies")
    if not os.path.exists(cookies_dir):
        os.makedirs(cookies_dir)

    # a throw-away cookie jar per request; removed again in the finally block below
    cookie_file_name = str(uuid.uuid4())
    cookie_file = os.path.join(cookies_dir, cookie_file_name)

    start_time = time.time()
    error_msg = None

    # setup timeouts for the request; ensure we use integers since that is what curl needs
    connection_timeout = int(connection_timeout)
    maximum_timeout = connection_timeout + 2

    # verify against the supplied CA bundle when there is one, otherwise skip verification
    ssl_options = ['-k']
    if ca_certs:
        ssl_options = ['--cacert', ca_certs]

    try:
        if return_only_http_code:
            # print only the status code and discard the body
            _, curl_stdout, curl_stderr = get_user_call_output(
                ['curl', '--location-trusted'] + ssl_options + [
                    '--negotiate', '-u', ':', '-b', cookie_file, '-c', cookie_file,
                    '-w', '%{http_code}', url,
                    '--connect-timeout', str(connection_timeout),
                    '--max-time', str(maximum_timeout),
                    '-o', '/dev/null'
                ],
                user=user, env=kerberos_env)
        else:
            # returns response body
            curl_command = ['curl', '--location-trusted'] + ssl_options + [
                '--negotiate', '-u', ':', '-b', cookie_file, '-c', cookie_file, url,
                '--connect-timeout', str(connection_timeout),
                '--max-time', str(maximum_timeout)
            ]

            # header and body are only honoured when an explicit method is given;
            # the resulting argument order is: -H header, -X method, -d body
            if method:
                if header:
                    curl_command.extend(['-H', header])
                curl_command.extend(['-X', method])
                if body:
                    curl_command.extend(['-d', body])

            _, curl_stdout, curl_stderr = get_user_call_output(
                curl_command, user=user, env=kerberos_env)
    except Fail:
        if logger.isEnabledFor(logging.DEBUG):
            logger.exception("Unable to make a curl request for {0}.".format(caller_label))
        raise
    finally:
        if os.path.isfile(cookie_file):
            os.remove(cookie_file)

    # empty quotes evaluates to false
    if curl_stderr:
        error_msg = curl_stderr

    # NOTE: despite the name this is the elapsed time in seconds (time.time() difference)
    time_millis = time.time() - start_time

    # empty quotes evaluates to false
    if curl_stdout:
        if return_only_http_code:
            return (int(curl_stdout), error_msg, time_millis)
        else:
            return (curl_stdout, error_msg, time_millis)

    logger.debug("The curl response for %s is empty; standard error = %s",
                 caller_label, str(error_msg))

    return ("", error_msg, time_millis)
def rebalancehdfs(self, env):
    """
    Run the HDFS balancer and report its progress as structured output.

    The balancer threshold is read from the JSON document in params.name_node_params.
    When security is enabled, a dedicated Kerberos credentials cache is prepared for the
    hdfs user before the balancer starts and is deleted again afterwards, whether or not
    the balancer command succeeded.

    :param env: the execution environment; params are registered on it and
                env.config.basedir is read to locate the scripts directory
    """
    import params
    env.set_params(params)

    name_node_parameters = json.loads(params.name_node_params)
    threshold = name_node_parameters['threshold']
    _print("Starting balancer with threshold = %s\n" % threshold)

    rebalance_env = {'PATH': params.hadoop_bin_dir}

    # NOTE(review): the local names used inside the format("...{name}...") templates below
    # (ccache_file_path, threshold) are presumably resolved by the project's format() helper
    # from the caller's scope and params - keep them stable; confirm before renaming.
    if params.security_enabled:
        # Create the kerberos credentials cache (ccache) file and set it in the environment
        # to use when executing HDFS rebalance command. Use the md5 hash of the combination
        # of the principal and keytab file to generate a (relatively) unique cache filename
        # so that we can use it as needed.
        # TODO: params.tmp_dir=/var/lib/ambari-agent/tmp. However hdfs user doesn't have
        # TODO: access to this path. Hence using /tmp
        ccache_file_name = "hdfs_rebalance_cc_" + _md5(
            format("{hdfs_principal_name}|{hdfs_user_keytab}")).hexdigest()
        ccache_file_path = os.path.join(tempfile.gettempdir(), ccache_file_name)
        rebalance_env['KRB5CCNAME'] = ccache_file_path

        # If there are no tickets in the cache or they are expired, perform a kinit, else
        # use what is in the cache
        klist_cmd = format("{klist_path_local} -s {ccache_file_path}")
        kinit_cmd = format(
            "{kinit_path_local} -c {ccache_file_path} -kt {hdfs_user_keytab} {hdfs_principal_name}")
        if shell.call(klist_cmd, user=params.hdfs_user)[0] != 0:
            Execute(kinit_cmd, user=params.hdfs_user)

    def calculateCompletePercent(first, current):
        # avoid division by zero
        try:
            # float() forces true division so the ratio is not truncated to 0 or 1 under
            # Python 2 should the parser ever yield integer byte counts.
            division_result = float(current.bytesLeftToMove) / first.bytesLeftToMove
        except ZeroDivisionError:
            Logger.warning(
                "Division by zero. Bytes Left To Move = {0}. Return 1.0".format(
                    first.bytesLeftToMove))
            return 1.0
        return 1.0 - division_result

    def startRebalancingProcess(threshold, rebalance_env):
        rebalanceCommand = format(
            'hdfs --config {hadoop_conf_dir} balancer -threshold {threshold}')
        return as_user(rebalanceCommand, params.hdfs_user, env=rebalance_env)

    command = startRebalancingProcess(threshold, rebalance_env)

    # NOTE(review): basedir is computed but not passed to Execute below - confirm whether
    # the command was meant to run with it as the working directory.
    basedir = os.path.join(env.config.basedir, 'scripts')
    if (threshold == 'DEBUG'):  # FIXME TODO remove this on PROD
        basedir = os.path.join(env.config.basedir, 'scripts', 'balancer-emulator')
        command = ['ambari-python-wrap', 'hdfs-command.py']

    _print("Executing command %s\n" % command)

    parser = hdfs_rebalance.HdfsParser()

    def handle_new_line(line, is_stderr):
        # only stdout lines are echoed and fed to the parser
        if is_stderr:
            return

        _print('[balancer] %s' % (line))
        pl = parser.parseLine(line)
        if pl:
            res = pl.toJson()
            res['completePercent'] = calculateCompletePercent(parser.initialLine, pl)
            self.put_structured_out(res)
        elif parser.state == 'PROCESS_FINISED':
            _print('[balancer] %s' % ('Process is finished'))
            self.put_structured_out({'completePercent': 1})
            return

    try:
        Execute(
            command,
            on_new_line=handle_new_line,
            logoutput=False,
        )
    finally:
        if params.security_enabled:
            # Delete the kerberos credentials cache (ccache) file, also when the balancer
            # command failed, so that no ticket cache is left behind in the temp directory.
            File(
                ccache_file_path,
                action="delete",
            )