Example No. 1
def is_active_namenode(hdfs_binary):
  """
  Checks whether the current NameNode is active. Waits up to 30 seconds. Returns False if the other NameNode is active.
  :return: True if current NameNode is active, False otherwise
  """
  import params

  if params.dfs_ha_enabled:
    is_active_this_namenode_cmd = as_user(format("{hdfs_binary} --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"), params.hdfs_user, env={'PATH':params.hadoop_bin_dir})
    is_active_other_namenode_cmd = as_user(format("{hdfs_binary} --config {hadoop_conf_dir} haadmin -getServiceState {other_namenode_id} | grep active"), params.hdfs_user, env={'PATH':params.hadoop_bin_dir})

    for i in range(0, 5):
      code, out = shell.call(is_active_this_namenode_cmd) # If active NN, code will be 0
      if code == 0: # active
        return True

      code, out = shell.call(is_active_other_namenode_cmd) # If other NN is active, code will be 0
      if code == 0: # other NN is active
        return False

      if i < 4: # Do not sleep after last iteration
        time.sleep(6)

    Logger.info("Active NameNode is not found.")
    return False

  else:
    return True
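The check above leans on grep's exit status: the pipeline returns 0 only when "active" appears in the haadmin output, so shell.call's return code is the whole answer. A minimal stand-alone sketch of the same pattern with plain subprocess (hypothetical helper, outside the resource_management framework):

import subprocess

def namenode_is_active(hdfs_binary, hadoop_conf_dir, namenode_id):
    # grep exits 0 only when "active" is found, so the pipeline's return
    # code tells us whether this NameNode is currently the active one.
    cmd = "%s --config %s haadmin -getServiceState %s | grep active" % (
        hdfs_binary, hadoop_conf_dir, namenode_id)
    return subprocess.call(cmd, shell=True) == 0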
Example No. 2
def get_write_lock_files_solr_cloud(hadoop_prefix, collections):
    import params

    write_locks_to_delete = ''

    for collection_path in collections:
        code, output = call(format('{hadoop_prefix} -ls {collection_path}'))
        core_paths = get_core_paths(output, collection_path)

        collection_name = collection_path.replace(format('{solr_hdfs_directory}/'), '')
        zk_code, zk_output = call(format(
            '{zk_client_prefix} -cmd get {solr_cloud_zk_directory}/collections/{collection_name}/state.json'),
            env={'JAVA_HOME': params.java64_home},
            timeout=60
        )
        if zk_code != 0:
            Logger.error(format('Cannot determine cores owned by [{solr_hostname}] in collection [{collection_name}] due to ZK error.'))
            continue

        for core_path in core_paths:
            core_node_name = core_path.replace(format('{collection_path}/'), '')
            pattern = re.compile(format(HOSTNAME_VERIFIER_PATTERN), re.MULTILINE|re.DOTALL)
            core_on_hostname = re.search(pattern, zk_output)
            if core_on_hostname is not None:
                write_locks_to_delete += WRITE_LOCK_PATTERN.format(core_path)

    return write_locks_to_delete
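The constants WRITE_LOCK_PATTERN and HOSTNAME_VERIFIER_PATTERN (and the helper get_core_paths) are defined elsewhere in the original module and are not shown in this excerpt; a rough, purely illustrative sketch of what the two constants might look like (the real definitions may differ):

# Purely illustrative guesses -- the actual values live elsewhere in the module.
WRITE_LOCK_PATTERN = '{0}/data/index/write.lock '  # str.format()-ed with each core path, space-separated
HOSTNAME_VERIFIER_PATTERN = '"{core_node_name}".*?"node_name"\\s*:\\s*"{solr_hostname}:'  # searched in state.json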
Example No. 3
  def post_upgrade_restart(self, env, upgrade_type=None):
    if upgrade_type == "nonrolling":
      return

    Logger.info("Executing Stack Upgrade post-restart")
    import params
    env.set_params(params)
    zk_server_host = random.choice(params.zookeeper_hosts)
    cli_shell = format("{zk_cli_shell} -server {zk_server_host}:{client_port}")
    # Ensure that a quorum is still formed.
    unique = get_unique_id_and_date()
    create_command = format("echo 'create /{unique} mydata' | {cli_shell}")
    list_command = format("echo 'ls /' | {cli_shell}")
    delete_command = format("echo 'delete /{unique} ' | {cli_shell}")

    quorum_err_message = "Failed to establish zookeeper quorum"
    call_and_match_output(create_command, 'Created', quorum_err_message, user=params.zk_user)
    call_and_match_output(list_command, r"\[.*?" + unique + ".*?\]", quorum_err_message, user=params.zk_user)
    shell.call(delete_command, user=params.zk_user)

    if params.client_port:
      check_leader_command = format("echo stat | nc localhost {client_port} | grep Mode")
      code, out = shell.call(check_leader_command, logoutput=False)
      if code == 0 and out:
        Logger.info(out)
Example No. 4
  def actionexecute(self, env):
    config = Script.get_config()

    version = default('/commandParams/version', None)
    stack_name = default('/hostLevelParams/stack_name', "")

    if not version:
      raise Fail("Value is required for '/commandParams/version'")
  
    # other os?
    if OSCheck.is_redhat_family():
      cmd = ('/usr/bin/yum', 'clean', 'all')
      code, out = shell.call(cmd, sudo=True)

    min_ver = format_hdp_stack_version("2.2")
    real_ver = format_hdp_stack_version(version)
    if stack_name == "HDP":
      if compare_versions(real_ver, min_ver) >= 0:
        cmd = ('hdp-select', 'set', 'all', version)
        code, out = shell.call(cmd, sudo=True)

      if compare_versions(real_ver, format_hdp_stack_version("2.3")) >= 0:
        # backup the old and symlink /etc/[component]/conf to /usr/hdp/current/[component]
        for k, v in conf_select.PACKAGE_DIRS.iteritems():
          for dir_def in v:
            link_config(dir_def['conf_dir'], dir_def['current_dir'])
Example No. 5
def prepare_rpcbind():
    Logger.info("check if native nfs server is running")
    p, output = shell.call("pgrep nfsd")
    if p == 0:
        Logger.info("native nfs server is running. shutting it down...")
        # shutdown nfs
        shell.call("service nfs stop")
        shell.call("service nfs-kernel-server stop")
        Logger.info("check if the native nfs server is down...")
        p, output = shell.call("pgrep nfsd")
        if p == 0:
            raise Fail("Failed to shutdown native nfs service")

    Logger.info("check if rpcbind or portmap is running")
    p, output = shell.call("pgrep rpcbind")
    q, output = shell.call("pgrep portmap")

    if p != 0 and q != 0:
        Logger.info("no portmap or rpcbind running. starting one...")
        p, output = shell.call(("service", "rpcbind", "start"), sudo=True)
        q, output = shell.call(("service", "portmap", "start"), sudo=True)
        if p != 0 and q != 0:
            raise Fail("Failed to start rpcbind or portmap")

    Logger.info("now we are ready to start nfs gateway")
Example No. 6
 def _check_existence(self, name):
   code, out = shell.call(CHECK_CMD % name)
   if bool(code):
     return False
   elif '*' in name or '?' in name:  # Check if all packages matching pattern are installed
     code1, out1 = shell.call(GET_NOT_INSTALLED_CMD % name)
     return NO_PACKAGES_FOUND_STATUS in out1.splitlines()
   else:
     return True
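CHECK_CMD, GET_NOT_INSTALLED_CMD and NO_PACKAGES_FOUND_STATUS come from the surrounding package-provider module and are not part of this excerpt; a hedged, yum-flavoured sketch of values they could hold:

# Illustrative only -- the real provider defines its own command templates.
CHECK_CMD = "rpm -q '%s' > /dev/null"              # exit code 0 if the package is installed
GET_NOT_INSTALLED_CMD = "yum list available '%s'"  # lists matching packages that are NOT installed yet
NO_PACKAGES_FOUND_STATUS = "Error: No matching Packages to list"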
Example No. 7
def initiate_safe_zkfc_failover():
  """
  If this is the active namenode, initiate a safe failover and wait for it to become the standby.

  If an error occurs, force a failover to happen by killing ZKFC on this host. In this case, ZKFC will also have to
  be started manually during the Restart.
  """
  import params

  # Must kinit before running the HDFS command
  if params.security_enabled:
    Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
            user = params.hdfs_user)

  check_service_cmd = format("hdfs haadmin -getServiceState {namenode_id}")
  code, out = shell.call(check_service_cmd, logoutput=True, user=params.hdfs_user)

  original_state = "unknown"
  if code == 0 and out:
    original_state = "active" if "active" in out else ("standby" if "standby" in out else original_state)
    Logger.info("Namenode service state: %s" % original_state)

    if original_state == "active":
      msg = "Rolling Upgrade - Initiating a ZKFC failover on {0} NameNode host {1}.".format(original_state, params.hostname)
      Logger.info(msg)

      check_standby_cmd = format("hdfs haadmin -getServiceState {namenode_id} | grep standby")
      failover_command = format("hdfs haadmin -failover {namenode_id} {other_namenode_id}")

      code, out = shell.call(failover_command, user=params.hdfs_user, logoutput=True)
      Logger.info(format("Rolling Upgrade - failover command returned {code}"))
      wait_for_standby = False

      if code == 0:
        wait_for_standby = True
      else:
        # Try to kill ZKFC manually
        was_zkfc_killed = kill_zkfc(params.hdfs_user)
        code, out = shell.call(check_standby_cmd, user=params.hdfs_user, logoutput=True)
        Logger.info(format("Rolling Upgrade - check for standby returned {code}"))

        if code == 255 and out:
          Logger.info("Rolling Upgrade - namenode is already down.")
        else:
          if was_zkfc_killed:
            # Only mandate that this be the standby namenode if ZKFC was indeed killed to initiate a failover.
            wait_for_standby = True

      if wait_for_standby:
        Logger.info("Waiting for this NameNode to become the standby one.")
        Execute(check_standby_cmd,
                user=params.hdfs_user,
                tries=50,
                try_sleep=6,
                logoutput=True)
  else:
    raise Fail("Unable to determine NameNode HA states by calling command: {0}".format(check_service_cmd))
Example No. 8
 def _check_existence(self, name):
     if "." in name:  # To work with names like 'zookeeper_2_2_1_0_2072.noarch'
         name = os.path.splitext(name)[0]
     code, out = shell.call(CHECK_CMD % name)
     if bool(code):
         return False
     elif "*" in name or "?" in name:  # Check if all packages matching pattern are installed
         code1, out1 = shell.call(CHECK_AVAILABLE_PACKAGES_CMD % name)
         return not bool(code1)
     else:
         return True
Example No. 9
 def _init_cmd(self, command):
   if self._upstart:
     if command == "status":
       ret,out = shell.call(["/sbin/" + command, self.resource.service_name])
       _proc, state = out.strip().split(' ', 1)
       ret = 0 if state != "stop/waiting" else 1
     else:
       ret,out = shell.call(["/sbin/" + command, self.resource.service_name])
   else:
     ret,out = shell.call(["/etc/init.d/%s" % self.resource.service_name, command])
   return ret,out
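The upstart branch parses output such as "<job> start/running, process <pid>" or "<job> stop/waiting"; a quick illustration of the split (sample output only):

# Sample upstart "status" output, for illustration:
out = "zookeeper-server start/running, process 4620"
_proc, state = out.strip().split(' ', 1)   # ("zookeeper-server", "start/running, process 4620")
ret = 0 if state != "stop/waiting" else 1  # 0 means the service is running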
Example No. 10
  def action_run(self):
    from tempfile import NamedTemporaryFile

    Logger.info("Running script %s" % self.resource)
    with NamedTemporaryFile(prefix="resource_management-script", bufsize=0) as tf:
      tf.write(self.resource.code)
      tf.flush()

      _ensure_metadata(tf.name, self.resource.user, self.resource.group)
      shell.call([self.resource.interpreter, tf.name],
                      cwd=self.resource.cwd, env=self.resource.environment,
                      preexec_fn=_preexec_fn(self.resource))
Example No. 11
 def _check_existence(self, name):
   code, out = shell.call(CHECK_EXISTENCE_CMD % name)
   if bool(code):
     return False
   elif '*' in name or '.' in name:  # Check if all packages matching regexp are installed
     code1, out1 = shell.call(GET_PACKAGES_BY_PATTERN_CMD % name)
     for package_name in out1.splitlines():
       code2, out2 = shell.call(GET_PACKAGE_STATUS_CMD % package_name)
       if PACKAGE_INSTALLED_STATUS not in out2.splitlines():
         return False
     return True
   else:
     return True
Example No. 12
def setup_solr_cloud():
    import params

    code, output = call(
            format(
                    '{zk_client_prefix} -cmd get {solr_cloud_zk_directory}{clusterstate_json}'
            ),
            env={'JAVA_HOME': params.java64_home},
            timeout=60
    )

    if not ("NoNodeException" in output):
        Logger.info(
                format(
                        "ZK node {solr_cloud_zk_directory}{clusterstate_json} already exists, skipping ..."
                )
        )
        return

    Execute(
            format(
                    '{zk_client_prefix} -cmd makepath {solr_cloud_zk_directory}'
            ),
            environment={'JAVA_HOME': params.java64_home},
            ignore_failures=True,
            user=params.solr_config_user
    )
Example No. 13
def kill_zkfc(zkfc_user):
  """
  There are two potential methods for failing over the namenode, especially during a Rolling Upgrade.
  Option 1. Kill zkfc on primary namenode provided that the secondary is up and has zkfc running on it.
  Option 2. Silent failover (not supported as of HDP 2.2.0.0)
  :param zkfc_user: User that started the ZKFC process.
  :return: Return True if ZKFC was killed, otherwise, false.
  """
  import params
  if params.dfs_ha_enabled:
    zkfc_pid_file = get_service_pid_file("zkfc", zkfc_user)
    if zkfc_pid_file:
      check_process = as_user(format("ls {zkfc_pid_file} > /dev/null 2>&1 && ps -p `cat {zkfc_pid_file}` > /dev/null 2>&1"), user=zkfc_user)
      code, out = shell.call(check_process)
      if code == 0:
        Logger.debug("ZKFC is running and will be killed.")
        kill_command = format("kill -15 `cat {zkfc_pid_file}`")
        Execute(kill_command,
             user=zkfc_user
        )
        File(zkfc_pid_file,
             action = "delete",
        )
        return True
  return False
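For reference, a hedged stand-alone sketch of the same check-then-kill pattern using os.kill instead of shelling out (hypothetical helper, not part of the original module):

import os
import signal

def kill_from_pid_file(pid_file):
    # Hypothetical equivalent of the pattern above: verify the pid file exists,
    # send SIGTERM ("kill -15") to the recorded pid, then remove the pid file.
    if not os.path.isfile(pid_file):
        return False
    with open(pid_file) as f:
        pid = int(f.read().strip())
    try:
        os.kill(pid, signal.SIGTERM)
    except OSError:
        return False
    os.remove(pid_file)
    return True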
Example No. 14
def create(stack_name, package, version, dry_run = False):
  """
  Creates a config version for the specified package
  :param stack_name: the name of the stack
  :param package: the name of the package, as-used by conf-select
  :param version: the version number to create
  :return List of directories created
  """
  Logger.info("Checking if need to create versioned conf dir /etc/{0}/{1}/0".format(package, version))
  if not _valid(stack_name, package, version):
    Logger.info("Will not create it since parameters are not valid.")
    return []

  command = "dry-run-create" if dry_run else "create-conf-dir"

  code, stdout, stderr = shell.call(get_cmd(command, package, version), logoutput=False, quiet=False, sudo=True, stderr = subprocess.PIPE)

  # conf-select can set more than one directory
  # per package, so return that list, especially for dry_run
  dirs = []
  if 0 == code and stdout is not None: # just be sure we have a stdout
    for line in stdout.splitlines():
      dirs.append(line.rstrip('\n'))

  # take care of permissions
  if not code and stdout and command == "create-conf-dir":
    for d in dirs:
      Directory(d,
          mode=0755,
          cd_access='a',
          recursive=True)

  return dirs
Example No. 15
def pre_rolling_upgrade_shutdown(hdfs_binary):
  """
  Runs the "shutdownDatanode {ipc_address} upgrade" command to shutdown the
  DataNode in preparation for an upgrade. This will then periodically check
  "getDatanodeInfo" to ensure the DataNode has shutdown correctly.
  This function will obtain the Kerberos ticket if security is enabled.
  :param hdfs_binary: name/path of the HDFS binary to use
  :return: True if the command ran OK (even with errors), and False if the DataNode needs to be stopped forcefully.
  """
  import params

  Logger.info('DataNode executing "shutdownDatanode" command in preparation for upgrade...')
  if params.security_enabled:
    Execute(params.dn_kinit_cmd, user = params.hdfs_user)

  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  command = format('{dfsadmin_base_command} -shutdownDatanode {dfs_dn_ipc_address} upgrade')

  code, output = shell.call(command, user=params.hdfs_user)
  if code == 0:
    # verify that the datanode is down
    _check_datanode_shutdown(hdfs_binary)
  else:
    # Due to bug HDFS-7533, DataNode may not always shutdown during stack upgrade, and it is necessary to kill it.
    if output is not None and re.search("Shutdown already in progress", output):
      Logger.error("Due to a known issue in DataNode, the command {0} did not work, so will need to shutdown the datanode forcefully.".format(command))
      return False
  return True
Example No. 16
def get_hdp_version():
  try:
    command = 'hdp-select status hadoop-client'
    return_code, hdp_output = shell.call(command, timeout=20)
  except Exception, e:
    Logger.error(str(e))
    raise Fail('Unable to execute hdp-select command to retrieve the version.')
Example No. 17
 def status(self, env):    
   cmd = 'ps -ef | grep proc_rangerkms | grep -v grep'
   code, output = shell.call(cmd, timeout=20)
   if code != 0:
     Logger.debug('KMS process not running')
     raise ComponentIsNotRunning()
   pass
Example No. 18
def remove_solr_ssl_support():
    import params

    if not params.solr_cloud_mode:
        return

    code, output = call(
            format(
                    '{zk_client_prefix} -cmd get {solr_cloud_zk_directory}{clusterprops_json}'
            ),
            env={'JAVA_HOME': params.java64_home},
            timeout=60
    )

    if "NoNodeException" in output:
        return

    Execute(
            format(
                    '{zk_client_prefix} -cmd clear {solr_cloud_zk_directory}{clusterprops_json}'
            ),
            environment={'JAVA_HOME': params.java64_home},
            ignore_failures=True,
            user=params.solr_config_user
    )
Example No. 19
def _get_current_hiveserver_version():
  """
  Runs "hive --version" and parses the result in order
  to obtain the current version of hive.

  :return:  the hiveserver2 version, returned by "hive --version"
  """
  import params

  try:
    # When downgrading the source version should be the version we are downgrading from
    if "downgrade" == params.upgrade_direction:
      if not params.downgrade_from_version:
        raise Fail('The version from which we are downgrading from should be provided in \'downgrade_from_version\'')
      source_version = params.downgrade_from_version
    else:
      source_version = params.current_version
    hive_execute_path = _get_hive_execute_path(source_version)
    version_hive_bin = params.hive_bin
    formatted_source_version = format_hdp_stack_version(source_version)
    if formatted_source_version and compare_versions(formatted_source_version, "2.2") >= 0:
      version_hive_bin = format('/usr/hdp/{source_version}/hive/bin')
    command = format('{version_hive_bin}/hive --version')
    return_code, hdp_output = shell.call(command, user=params.hive_user, path=hive_execute_path)
  except Exception, e:
    Logger.error(str(e))
    raise Fail('Unable to execute hive --version command to retrieve the hiveserver2 version.')
Example No. 20
 def get_hdp_version():
   if not options.hdp_version:
     # Ubuntu returns: "stdin: is not a tty", as subprocess output.
     tmpfile = tempfile.NamedTemporaryFile()
     out = None
     with open(tmpfile.name, 'r+') as file:
       get_hdp_version_cmd = '/usr/bin/hdp-select status %s > %s' % ('hadoop-mapreduce-historyserver', tmpfile.name)
       code, stdoutdata = shell.call(get_hdp_version_cmd)
       out = file.read()
     pass
     if code != 0 or out is None:
       Logger.warning("Could not verify HDP version by calling '%s'. Return Code: %s, Output: %s." %
                      (get_hdp_version_cmd, str(code), str(out)))
       return 1
    
     matches = re.findall(r"([\d\.]+\-\d+)", out)
     hdp_version = matches[0] if matches and len(matches) > 0 else None
    
     if not hdp_version:
       Logger.error("Could not parse HDP version from output of hdp-select: %s" % str(out))
       return 1
   else:
     hdp_version = options.hdp_version
     
   return hdp_version
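The regex above extracts a "<version>-<build>" token from the hdp-select output; a quick illustration against a sample line (the exact output format may vary between HDP releases):

import re

# Sample hdp-select output line, for illustration only:
out = "hadoop-mapreduce-historyserver - 2.3.4.0-3485"
matches = re.findall(r"([\d\.]+\-\d+)", out)
print(matches[0] if matches else None)  # -> 2.3.4.0-3485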
Example No. 21
def _check_datanode_startup(hdfs_binary):
  """
  Checks that a DataNode is reported as being alive via the
  "hdfs dfsadmin -fs {namenode_address} -report -live" command. Once the DataNode is found to be
  alive this method will return, otherwise it will raise a Fail(...) and retry
  automatically.
  :param hdfs_binary: name/path of the HDFS binary to use
  :return:
  """
  import params
  import socket

  try:
    dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
    command = dfsadmin_base_command + ' -report -live'
    return_code, hdfs_output = shell.call(command, user=params.hdfs_user)
  except:
    raise Fail('Unable to determine if the DataNode has started after upgrade.')

  if return_code == 0:
    hostname = params.hostname.lower()
    hostname_ip =  socket.gethostbyname(params.hostname.lower())
    if hostname in hdfs_output.lower() or hostname_ip in hdfs_output.lower():
      Logger.info("DataNode {0} reports that it has rejoined the cluster.".format(params.hostname))
      return
    else:
      raise Fail("DataNode {0} was not found in the list of live DataNodes".format(params.hostname))

  # return_code is not 0, fail
  raise Fail("Unable to determine if the DataNode has started after upgrade (result code {0})".format(str(return_code)))
Example No. 22
def bootstrap_standby_namenode(params, use_path=False):

  bin_path = os.path.join(params.hadoop_bin_dir, '') if use_path else ""

  try:
    iterations = 50
    bootstrap_cmd = format("{bin_path}hdfs namenode -bootstrapStandby -nonInteractive")
    # Blue print based deployments start both NN in parallel and occasionally
    # the first attempt to bootstrap may fail. Depending on how it fails the
    # second attempt may not succeed (e.g. it may find the folder and decide that
    # bootstrap succeeded). The solution is to call with -force option but only
    # during initial start
    if params.command_phase == "INITIAL_START":
      bootstrap_cmd = format("{bin_path}hdfs namenode -bootstrapStandby -nonInteractive -force")
    Logger.info("Boostrapping standby namenode: %s" % (bootstrap_cmd))
    for i in range(iterations):
      Logger.info('Try %d out of %d' % (i+1, iterations))
      code, out = shell.call(bootstrap_cmd, logoutput=False, user=params.hdfs_user)
      if code == 0:
        Logger.info("Standby namenode bootstrapped successfully")
        return True
      elif code == 5:
        Logger.info("Standby namenode already bootstrapped")
        return True
      else:
        Logger.warning('Bootstrap standby namenode failed with %d error code. Will retry' % (code))
  except Exception as ex:
    Logger.error('Bootstrap standby namenode threw an exception. Reason %s' %(str(ex)))
  return False
Example No. 23
def _check_datanode_startup():
  """
  Checks that a DataNode is reported as being alive via the
  "hdfs dfsadmin -report -live" command. Once the DataNode is found to be
  alive this method will return, otherwise it will raise a Fail(...) and retry
  automatically.
  :return:
  """
  import params

  try:
    # 'su - hdfs -c "hdfs dfsadmin -report -live"'
    command = 'hdfs dfsadmin -report -live'
    return_code, hdfs_output = shell.call(command, user=params.hdfs_user)
  except:
    raise Fail('Unable to determine if the DataNode has started after upgrade.')

  if return_code == 0:
    if params.hostname.lower() in hdfs_output.lower():
      Logger.info("DataNode {0} reports that it has rejoined the cluster.".format(params.hostname))
      return
    else:
      raise Fail("DataNode {0} was not found in the list of live DataNodes".format(params.hostname))

  # return_code is not 0, fail
  raise Fail("Unable to determine if the DataNode has started after upgrade (result code {0})".format(str(return_code)))
Example No. 24
def _check_nodemanager_startup():
  '''
  Checks that a NodeManager is in a RUNNING state in the cluster via
  "yarn node -list -states=RUNNING" command. Once the NodeManager is found to be
  alive this method will return, otherwise it will raise a Fail(...) and retry
  automatically.
  :return:
  '''
  import params

  command = 'yarn node -list -states=RUNNING'

  try:
    # 'su - yarn -c "yarn node -status c6401.ambari.apache.org:45454"'
    return_code, yarn_output = shell.call(command, user=params.hdfs_user)
  except:
    raise Fail('Unable to determine if the NodeManager has started after upgrade.')

  if return_code == 0:
    hostname = params.hostname.lower()
    nodemanager_address = params.nm_address.lower()
    yarn_output = yarn_output.lower()

    if hostname in yarn_output or nodemanager_address in yarn_output:
      Logger.info('NodeManager with ID {0} has rejoined the cluster.'.format(nodemanager_address))
      return
    else:
      raise Fail('NodeManager with ID {0} was not found in the list of running NodeManagers'.format(nodemanager_address))

  raise Fail('Unable to determine if the NodeManager has started after upgrade (result code {0})'.format(str(return_code)))
Example No. 25
def check_process_status(pid_file):
  """
  Checks whether the process is running. The process is considered running if the pid file
  exists and the process with the pid recorded in that file is running.
  If the process is not running, a ComponentIsNotRunning exception is raised.

  @param pid_file: path to service pid file
  """
  if not pid_file or not os.path.isfile(pid_file):
    raise ComponentIsNotRunning()
  
  try:
    pid = int(sudo.read_file(pid_file))
  except:
    Logger.debug("Pid file {0} does not exist".format(pid_file))
    raise ComponentIsNotRunning()

  code, out = shell.call(["ps","-p", str(pid)])
  
  if code:
    Logger.debug("Process with pid {0} is not running. Stale pid file"
              " at {1}".format(pid, pid_file))
    raise ComponentIsNotRunning()
  pass
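An alternative way to perform the same liveness check without spawning "ps" is signal 0 via os.kill; a small hedged sketch:

import errno
import os

def pid_is_running(pid):
    # Signal 0 performs error checking only: no signal is actually delivered.
    try:
        os.kill(pid, 0)
    except OSError as err:
        return err.errno == errno.EPERM  # process exists but belongs to another user
    return True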
Example No. 26
  def check_nifi_process_status(self, pid_file):
    """
    Checks whether the process is running. The process is considered running if the pid file
    exists and the process with the pid recorded in that file is running.
    If the process is not running, a ComponentIsNotRunning exception is raised.

    @param pid_file: path to service pid file
    """
    if not pid_file or not os.path.isfile(pid_file):
        raise ComponentIsNotRunning()

    try:
        lines = [line.rstrip('\n') for line in open(pid_file)]
        pid = int(lines[2].split('=')[1])
    except:
        Logger.warn("Pid file {0} does not exist".format(pid_file))
        raise ComponentIsNotRunning()

    code, out = shell.call(["ps","-p", str(pid)])

    if code:
        Logger.debug("Process with pid {0} is not running. Stale pid file"
                     " at {1}".format(pid, pid_file))
        raise ComponentIsNotRunning()
    pass
Example No. 27
def main():
  # add service
  checked_call('curl -H \'X-Requested-By:anything\' -i -X POST -d \'[{{"ServiceInfo":{{"service_name":"{service_name}"}}}}]\' -u admin:admin {server_url}/api/v1/clusters/{cluster_name}/services'.
               format(service_name=SERVICE_NAME, server_url=SERVER_URL, cluster_name=CLUSTER_NAME))
  
  # add components
  for component in COMPONENTS:
    checked_call('curl -H \'X-Requested-By:anything\' -i -X POST -d \'{{"components":[{{"ServiceComponentInfo":{{"component_name":"{component}"}}}}]}}\' -u admin:admin {server_url}/api/v1/clusters/{cluster_name}/services?ServiceInfo/service_name={service_name}'.
               format(service_name=SERVICE_NAME, component=component, server_url=SERVER_URL, cluster_name=CLUSTER_NAME))
    
  # assign components to hosts
  for x in COMPONENTS_TO_HOSTS: 
    for component, host in x.iteritems():
      checked_call('curl -H \'X-Requested-By:anything\' -i -X POST -d \'{{"host_components":[{{"HostRoles":{{"component_name":"{component}"}}}}]}}\' -u admin:admin {server_url}/api/v1/clusters/{cluster_name}/hosts?Hosts/host_name={host}'.
               format(host=host, component=component, server_url=SERVER_URL, cluster_name=CLUSTER_NAME))
    
  # update and create all the service-specific configurations
  checked_call('curl -H \'X-Requested-By:anything\' -X GET -u admin:admin {server_url}/api/v1/stacks2/HDP/versions/{stack_version}/stackServices/{service_name}/configurations?fields=* > /tmp/config.json'.
               format(server_url=SERVER_URL, stack_version=STACK_VERSION, service_name=SERVICE_NAME))
  with open('/tmp/config.json', "r") as f:
    d = json.load(f)
  
  configs = {}
  for x in d['items']:
    site_name = x['StackConfigurations']['type'][:-4]
    if not site_name in configs:
      configs[site_name] = {}
    config = configs[site_name]
    config[x['StackConfigurations']['property_name']] = x['StackConfigurations']['property_value']

  for site_name, site_content in configs.iteritems():
    code = call('/var/lib/tbds-server/resources/scripts/configs.sh get {hostname} {cluster_name} {site_name}'.format(hostname=HOSTNAME, cluster_name=CLUSTER_NAME, site_name=site_name))[0]

    if code:
      print "Adding new site: "+site_name
      checked_call('curl -i -H \'X-Requested-By:anything\' -X PUT -d \'{{"Clusters":{{"desired_configs":{{"type":"{site_name}","tag":"version1","properties":{site_content}}}}}}}\' -u admin:admin {server_url}/api/v1/clusters/{cluster_name}'.format(site_name=site_name, site_content=json.dumps(site_content), server_url=SERVER_URL, cluster_name=CLUSTER_NAME))
    else:
      timestamp = int(time.time())
      print "Modifiying site: "+site_name+" version"+str(timestamp)
      checked_call('/var/lib/tbds-server/resources/scripts/configs.sh get {hostname} {cluster_name} {site_name} /tmp/current_site.json'.format(hostname=HOSTNAME, cluster_name=CLUSTER_NAME, site_name=site_name))
      
      with open('/tmp/current_site.json', "r") as f:
        fcontent = f.read()
        d = json.loads("{"+fcontent+"}")
      
      for k,v in site_content.iteritems():
        d['properties'][k] = v
        
      checked_call('curl -i -H \'X-Requested-By:anything\' -X PUT -d \'{{"Clusters":{{"desired_configs":{{"type":"{site_name}","tag":"version{timestamp}","properties":{site_content}}}}}}}\' -u admin:admin {server_url}/api/v1/clusters/{cluster_name}'.format(site_name=site_name, timestamp=timestamp, site_content=json.dumps(d['properties']), server_url=SERVER_URL, cluster_name=CLUSTER_NAME))

  for site_name, site_configs in CONFIGS_TO_CHANGE.iteritems():
    for config_name, config_value in site_configs.iteritems():
      print "Adding config "+config_name+"="+config_value+" to "+site_name
      checked_call('/var/lib/tbds-server/resources/scripts/configs.sh set {hostname} {cluster_name} {site_name} {config_name} {config_value}'.format(config_name=config_name, config_value=config_value, hostname=HOSTNAME, cluster_name=CLUSTER_NAME, site_name=site_name))
      
        
  # install all new components
  checked_call('curl -H \'X-Requested-By:anything\' -i -X PUT -d  \'{{"RequestInfo": {{"context" :"Installing Services"}}, "Body": {{"ServiceInfo": {{"state": "INSTALLED"}}}}}}\' -u admin:admin {server_url}/api/v1/clusters/{cluster_name}/services?ServiceInfo/state=INIT'.
             format(server_url=SERVER_URL, cluster_name=CLUSTER_NAME))
Example No. 28
def check_fs_root():
  import params  
  fs_root_url = format("{fs_root}{hive_apps_whs_dir}")
  cmd = "/usr/lib/hive/bin/metatool -listFSRoot 2>/dev/null | grep hdfs://"
  code, out = call(cmd, user=params.hive_user)
  if code == 0 and fs_root_url.strip() != out.strip():
    cmd = format("/usr/lib/hive/bin/metatool -updateLocation {fs_root}{hive_apps_whs_dir} {out}")
    Execute(cmd, user=params.hive_user)
Example No. 29
def reach_safemode_state(user, safemode_state, in_ha, hdfs_binary):
  """
  Enter or leave safemode for the Namenode.
  :param user: user to perform action as
  :param safemode_state: Desired state of ON or OFF
  :param in_ha: bool indicating if Namenode High Availability is enabled
  :param hdfs_binary: name/path of the HDFS binary to use
  :return: Returns a tuple of (transition success, original state). If no change is needed, the indicator of
  success will be True
  """
  Logger.info("Prepare to transition into safemode state %s" % safemode_state)
  import params
  original_state = SafeMode.UNKNOWN

  dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
  safemode_base_command = dfsadmin_base_command + " -safemode "
  safemode_check_cmd = safemode_base_command + " get"

  grep_pattern = format("Safe mode is {safemode_state}")
  safemode_check_with_grep = format("{safemode_check_cmd} | grep '{grep_pattern}'")

  code, out = shell.call(safemode_check_cmd, user=user, logoutput=True)
  Logger.info("Command: %s\nCode: %d." % (safemode_check_cmd, code))
  if code == 0 and out is not None:
    Logger.info(out)
    re_pattern = r"Safe mode is (\S*)"
    Logger.info("Pattern to search: {0}".format(re_pattern))
    m = re.search(re_pattern, out, re.IGNORECASE)
    if m and len(m.groups()) >= 1:
      original_state = m.group(1).upper()

      if original_state == safemode_state:
        return (True, original_state)
      else:
        # Make a transition
        command = safemode_base_command + safemode_to_instruction[safemode_state]
        Execute(command,
                user=user,
                logoutput=True,
                path=[params.hadoop_bin_dir])

        code, out = shell.call(safemode_check_with_grep, user=user)
        Logger.info("Command: %s\nCode: %d. Out: %s" % (safemode_check_with_grep, code, out))
        if code == 0:
          return (True, original_state)
  return (False, original_state)
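A quick illustration of the "Safe mode is ..." parsing used above, against typical dfsadmin output (sample string only):

import re

out = "Safe mode is OFF"  # typical "hdfs dfsadmin -safemode get" output
m = re.search(r"Safe mode is (\S*)", out, re.IGNORECASE)
print(m.group(1).upper() if m else "UNKNOWN")  # -> OFF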
Example No. 30
    def status(self, env):
        cmd = "ps -ef | grep proc_rangeradmin | grep -v grep"
        code, output = shell.call(cmd, timeout=20)

        if code != 0:
            Logger.debug("Ranger admin process not running")
            raise ComponentIsNotRunning()
        pass
Example No. 31
    def status(self, env):
        import status_params

        env.set_params(status_params)

        if status_params.stack_supports_pid:
            check_process_status(status_params.ranger_admin_pid_file)
            return

        cmd = 'ps -ef | grep proc_rangeradmin | grep -v grep'
        code, output = shell.call(cmd, timeout=20)

        if code != 0:
            if self.is_ru_rangeradmin_in_progress(
                    status_params.upgrade_marker_file):
                Logger.info(
                    'Ranger admin process not running - skipping as stack upgrade is in progress'
                )
            else:
                Logger.debug('Ranger admin process not running')
                raise ComponentIsNotRunning()
        pass
Example No. 32
def _get_single_version_from_hdp_select():
    """
  Call "hdp-select versions" and return the version string if only one version is available.
  :return: Returns a version string if successful, and None otherwise.
  """
    # Ubuntu returns: "stdin: is not a tty", as subprocess output, so must use a temporary file to store the output.
    tmpfile = tempfile.NamedTemporaryFile()
    tmp_dir = Script.get_tmp_dir()
    tmp_file = os.path.join(tmp_dir, "copy_tarball_out.txt")
    hdp_version = None

    out = None
    get_hdp_versions_cmd = "/usr/bin/hdp-select versions > {0}".format(
        tmp_file)
    try:
        code, stdoutdata = shell.call(get_hdp_versions_cmd, logoutput=True)
        with open(tmp_file, 'r+') as file:
            out = file.read()
    except Exception, e:
        Logger.logger.exception(
            "Could not parse output of {0}. Error: {1}".format(
                str(tmp_file), str(e)))
Example No. 33
    def _exec_cmd(self, command, expect=None):
        if command != "status":
            self.log.info("%s command '%s'" % (self.resource, command))

        out = ""
        custom_cmd = getattr(self.resource, "%s_command" % command, None)
        if custom_cmd:
            self.log.debug("%s executing '%s'" % (self.resource, custom_cmd))
            if hasattr(custom_cmd, "__call__"):
                if custom_cmd():
                    ret = 0
                else:
                    ret = 1
            else:
                ret, out = shell.call(custom_cmd)
        else:
            # _init_cmd returns a (return code, output) tuple (see the init-script runner above)
            ret, out = self._init_cmd(command)

        if expect is not None and expect != ret:
            raise Fail(
                "%r command %s for service %s failed with return code: %d. %s"
                % (self, command, self.resource.service_name, ret, out))
        return ret
Example No. 34
def create(stack_name, package, version, dry_run = False):
  """
  Creates a config version for the specified package
  :param stack_name: the name of the stack
  :param package: the name of the package, as-used by <conf-selector-tool>
  :param version: the version number to create
  :param dry_run: False to create the versioned config directory, True to only return what would be created
  :return List of directories created
  """
  Logger.info("Checking if need to create versioned conf dir /etc/{0}/{1}/0".format(package, version))
  if not _valid(stack_name, package, version):
    Logger.info("Will not create it since parameters are not valid.")
    return []

  command = "dry-run-create" if dry_run else "create-conf-dir"

  code, stdout, stderr = shell.call(_get_cmd(command, package, version), logoutput=False, quiet=False, sudo=True, stderr = subprocess.PIPE)

  # <conf-selector-tool> can set more than one directory
  # per package, so return that list, especially for dry_run
  # > <conf-selector-tool> dry-run-create --package hive-hcatalog --stack-version 2.4.0.0-169 0
  # /etc/hive-webhcat/2.4.0.0-169/0
  # /etc/hive-hcatalog/2.4.0.0-169/0
  created_directories = []
  if 0 == code and stdout is not None: # just be sure we have a stdout
    for line in stdout.splitlines():
      created_directories.append(line.rstrip('\n'))

  # if directories were created, then do some post-processing
  if not code and stdout and not dry_run:
    # take care of permissions if directories were created
    for directory in created_directories:
      Directory(directory, mode=0755, cd_access='a', create_parents=True)

    # seed the new directories with configurations from the old (current) directories
    _seed_new_configuration_directories(package, created_directories)

  return created_directories
Example No. 35
def get_uid(user, return_existing=False):
    """
  Tries to get UID for username. It will try to find UID in custom properties in *cluster_env* and, if *return_existing=True*,
  it will try to return UID of existing *user*.

  :param user: username to get UID for
  :param return_existing: return UID for existing user
  :return:
  """
    import params
    user_str = str(user) + "_uid"
    service_env = [
        serviceEnv for serviceEnv in params.config['configurations']
        if user_str in params.config['configurations'][serviceEnv]
    ]

    if service_env and params.config['configurations'][
            service_env[0]][user_str]:
        service_env_str = str(service_env[0])
        uid = params.config['configurations'][service_env_str][user_str]
        if len(service_env) > 1:
            Logger.warning(
                "Multiple values found for %s, using %s" % (user_str, uid))
        return uid
    else:
        if return_existing:
            # pick up existing UID or try to find available UID in /etc/passwd, see changeToSecureUid.sh for more info
            if user == params.smoke_user:
                return None
            File(
                format("{tmp_dir}/changeUid.sh"),
                content=StaticFile("changeToSecureUid.sh"),
                mode=0555)
            code, newUid = shell.call(format("{tmp_dir}/changeUid.sh {user}"))
            return int(newUid)
        else:
            # do not return UID for existing user, used in User resource call to let OS to choose UID for us
            return None
Example No. 36
def bootstrap_standby_namenode(params, use_path=False):

    bin_path = os.path.join(params.hadoop_bin_dir, '') if use_path else ""

    try:
        iterations = 50
        bootstrap_cmd = format(
            "{bin_path}hdfs namenode -bootstrapStandby -nonInteractive")
        # Blue print based deployments start both NN in parallel and occasionally
        # the first attempt to bootstrap may fail. Depending on how it fails the
        # second attempt may not succeed (e.g. it may find the folder and decide that
        # bootstrap succeeded). The solution is to call with -force option but only
        # during initial start
        if params.command_phase == "INITIAL_START":
            bootstrap_cmd = format(
                "{bin_path}hdfs namenode -bootstrapStandby -nonInteractive -force"
            )
        Logger.info("Boostrapping standby namenode: %s" % (bootstrap_cmd))
        for i in range(iterations):
            Logger.info('Try %d out of %d' % (i + 1, iterations))
            code, out = shell.call(bootstrap_cmd,
                                   logoutput=False,
                                   user=params.hdfs_user)
            if code == 0:
                Logger.info("Standby namenode bootstrapped successfully")
                return True
            elif code == 5:
                Logger.info("Standby namenode already bootstrapped")
                return True
            else:
                Logger.warning(
                    'Bootstrap standby namenode failed with %d error code. Will retry'
                    % (code))
    except Exception as ex:
        Logger.error(
            'Bootstrap standby namenode threw an exception. Reason %s' %
            (str(ex)))
    return False
Example No. 37
def _check_nodemanager_startup():
    '''
  Checks that a NodeManager is in a RUNNING state in the cluster via
  "yarn node -list -states=RUNNING" command. Once the NodeManager is found to be
  alive this method will return, otherwise it will raise a Fail(...) and retry
  automatically.
  :return:
  '''
    import params

    command = 'yarn node -list -states=RUNNING'

    try:
        # 'su - yarn -c "yarn node -status c6401.ambari.apache.org:45454"'
        return_code, yarn_output = shell.call(command, user=params.hdfs_user)
    except:
        raise Fail(
            'Unable to determine if the NodeManager has started after upgrade.'
        )

    if return_code == 0:
        hostname = params.hostname.lower()
        nodemanager_address = params.nm_address.lower()
        yarn_output = yarn_output.lower()

        if hostname in yarn_output or nodemanager_address in yarn_output:
            Logger.info(
                'NodeManager with ID {0} has rejoined the cluster.'.format(
                    nodemanager_address))
            return
        else:
            raise Fail(
                'NodeManager with ID {0} was not found in the list of running NodeManagers'
                .format(nodemanager_address))

    raise Fail(
        'Unable to determine if the NodeManager has started after upgrade (result code {0})'
        .format(str(return_code)))
Example No. 38
def _check_datanode_startup(hdfs_binary):
    """
  Checks that a DataNode is reported as being alive via the
  "hdfs dfsadmin -fs {namenode_address} -report -live" command. Once the DataNode is found to be
  alive this method will return, otherwise it will raise a Fail(...) and retry
  automatically.
  :param hdfs_binary: name/path of the HDFS binary to use
  :return:
  """
    import params
    import socket

    try:
        dfsadmin_base_command = get_dfsadmin_base_command(hdfs_binary)
        command = dfsadmin_base_command + ' -report -live'
        return_code, hdfs_output = shell.call(command, user=params.hdfs_user)
    except:
        raise Fail(
            'Unable to determine if the DataNode has started after upgrade.')

    if return_code == 0:
        hostname = params.hostname.lower()
        hostname_ip = socket.gethostbyname(params.hostname.lower())
        if hostname in hdfs_output.lower() or hostname_ip in hdfs_output.lower(
        ):
            Logger.info(
                "DataNode {0} reports that it has rejoined the cluster.".
                format(params.hostname))
            return
        else:
            raise Fail(
                "DataNode {0} was not found in the list of live DataNodes".
                format(params.hostname))

    # return_code is not 0, fail
    raise Fail(
        "Unable to determine if the DataNode has started after upgrade (result code {0})"
        .format(str(return_code)))
Example No. 39
def get_stack_version(package_name):
    """
  @param package_name: name of the package from which the function will try to get the stack version
  """

    stack_selector_path = stack_tools.get_stack_tool_path(
        stack_tools.STACK_SELECTOR_NAME)

    if not os.path.exists(stack_selector_path):
        Logger.info(
            'Skipping get_stack_version since ' + stack_selector_path + ' is not yet available'
        )
        return None  # lazy fail

    try:
        command = 'ambari-python-wrap {stack_selector_path} status {package_name}'.format(
            stack_selector_path=stack_selector_path, package_name=package_name)
        return_code, stack_output = shell.call(command, timeout=20)
    except Exception, e:
        Logger.error(str(e))
        raise Fail(
            'Unable to execute ' + stack_selector_path + ' command to retrieve the version.'
        )
Example No. 40
def delete_write_lock_files():
    import params

    if params.security_enabled:
        kinit_if_needed = format(
                '{kinit_path_local} {hdfs_principal_name} -kt {hdfs_user_keytab}; ')
    else:
        kinit_if_needed = ''

    hadoop_prefix = format('{kinit_if_needed}hadoop --config {hadoop_conf_dir} dfs')
    code, output = call(format('{hadoop_prefix} -ls {solr_hdfs_directory}'))
    collections = get_collection_paths(output)

    if params.solr_cloud_mode:
        write_locks_to_delete = get_write_lock_files_solr_cloud(hadoop_prefix, collections)
    else:
        write_locks_to_delete = get_write_lock_files_solr_standalone(collections)

    if len(write_locks_to_delete) > 1:
        Execute(
                format('{hadoop_prefix} -rm -f {write_locks_to_delete}'),
                user=params.hdfs_user
                )
Example No. 41
def check_fs_root(conf_dir, execution_path):
  import params

  if not params.manage_hive_fsroot:
    Logger.info("Skipping fs root check as cluster-env/manage_hive_fsroot is disabled")
    return

  if not params.fs_root.startswith("hdfs://"):
    Logger.info("Skipping fs root check as fs_root does not start with hdfs://")
    return

  metatool_cmd = format("hive --config {conf_dir} --service metatool")
  cmd = as_user(format("{metatool_cmd} -listFSRoot", env={'PATH': execution_path}), params.hive_user) \
        + format(" 2>/dev/null | grep hdfs:// | cut -f1,2,3 -d '/' | grep -v '{fs_root}' | head -1")
  code, out = shell.call(cmd)

  if code == 0 and out.strip() != "" and params.fs_root.strip() != out.strip():
    out = out.strip()
    cmd = format("{metatool_cmd} -updateLocation {fs_root} {out}")
    Execute(cmd,
            user=params.hive_user,
            environment={'PATH': execution_path}
    )
Example No. 42
def _get_single_version_from_stack_select():
    """
  Call "<stack-selector> versions" and return the version string if only one version is available.
  :return: Returns a version string if successful, and None otherwise.
  """
    # Ubuntu returns: "stdin: is not a tty", as subprocess32 output, so must use a temporary file to store the output.
    tmp_dir = Script.get_tmp_dir()
    tmp_file = os.path.join(tmp_dir, "copy_tarball_out.txt")
    stack_version = None

    out = None
    stack_selector_path = stack_tools.get_stack_tool_path(
        stack_tools.STACK_SELECTOR_NAME)
    get_stack_versions_cmd = "{0} versions > {1}".format(
        stack_selector_path, tmp_file)
    try:
        code, stdoutdata = shell.call(get_stack_versions_cmd, logoutput=True)
        with open(tmp_file, 'r+') as file:
            out = file.read()
    except Exception, e:
        Logger.logger.exception(
            "Could not parse output of {0}. Error: {1}".format(
                str(tmp_file), str(e)))
Example No. 43
def check_folder_until_size_not_changes(dir):
    """
  Call "du -d 0 <dir> | cut -f 1" on the given directory repeatedly until the size stops changing (i.e., the copy operation has finished).
  """
    cmd = format("du -d 0 {dir} | cut -f 1")
    size_changed = True
    size_str = "-1"
    while size_changed:
        returncode, stdout = call(cmd,
                                  user=params.infra_solr_user,
                                  timeout=300)
        if stdout:
            actual_size_str = stdout.strip()
            if actual_size_str == size_str:
                size_changed = False
                continue
            else:
                Logger.info(
                    format(
                        "Actual size of '{dir}' is {actual_size_str}, wait 5 sec and check again, to make sure no copy operation is in progress..."
                    ))
                time.sleep(5)
                size_str = actual_size_str
Example No. 44
 def _chk_writable_mount(self, mount_point):
     if os.geteuid() == 0:
         return os.access(mount_point, os.W_OK)
     else:
         try:
             # test if mount point is writable for current user
             call_result = call(
                 ['test', '-w', mount_point],
                 sudo=True,
                 timeout=int(Hardware.CHECK_REMOTE_MOUNTS_TIMEOUT_DEFAULT) /
                 2,
                 quiet=not logger.isEnabledFor(logging.DEBUG))
             return call_result and call_result[0] == 0
         except ExecuteTimeoutException:
             logger.exception(
                 "Exception happened while checking mount {0}".format(
                     mount_point))
             return False
         except Fail:
             logger.exception(
                 "Exception happened while checking mount {0}".format(
                     mount_point))
             return False
Example No. 45
    def is_directory_exists_in_HDFS(self, path, as_user):
        kinit_path_local = get_kinit_path(
            default('/configurations/kerberos-env/executable_search_paths',
                    None))
        kinit_if_needed = format(
            "{kinit_path_local} -kt {zeppelin_kerberos_keytab} {zeppelin_kerberos_principal};"
        )

        #-d: if the path is a directory, return 0.
        path_exists = shell.call(format(
            "{kinit_if_needed} hdfs --config {hadoop_conf_dir} dfs -test -d {path};echo $?"
        ),
                                 user=as_user)[1]

        # if there is no kerberos setup then the string will contain "-bash: kinit: command not found"
        if "\n" in path_exists:
            path_exists = path_exists.split("\n").pop()

        # '1' means it does not exist
        if path_exists == '0':
            return True
        else:
            return False
Example No. 46
  def check_and_copy_notebook_in_hdfs(self, params):
    if params.config['configurations']['zeppelin-config']['zeppelin.notebook.dir'].startswith("/"):
      notebook_directory = params.config['configurations']['zeppelin-config']['zeppelin.notebook.dir']
    else:
      notebook_directory = "/user/" + format("{zeppelin_user}") + "/" + \
                           params.config['configurations']['zeppelin-config']['zeppelin.notebook.dir']

    kinit_path_local = get_kinit_path(default('/configurations/kerberos-env/executable_search_paths', None))
    kinit_if_needed = format("{kinit_path_local} -kt {zeppelin_kerberos_keytab} {zeppelin_kerberos_principal};")

    notebook_directory_exists = shell.call(format("{kinit_if_needed} hdfs --config {hadoop_conf_dir} dfs -test -e {notebook_directory};echo $?"),
                                           user=params.zeppelin_user)[1]

    #if there is no kerberos setup then the string will contain "-bash: kinit: command not found"
    if "\n" in notebook_directory_exists:
      notebook_directory_exists = notebook_directory_exists.split("\n")[1]

    # '1' means it does not exist
    if notebook_directory_exists == '1':
      # hdfs dfs -mkdir {notebook_directory}
      params.HdfsResource(format("{notebook_directory}"),
                          type="directory",
                          action="create_on_execute",
                          owner=params.zeppelin_user,
                          recursive_chown=True,
                          recursive_chmod=True
                          )

      # hdfs dfs -put /usr/hdp/current/zeppelin-server/notebook/ {notebook_directory}
      params.HdfsResource(format("{notebook_directory}"),
                            type="directory",
                            action="create_on_execute",
                            source=params.notebook_dir,
                            owner=params.zeppelin_user,
                            recursive_chown=True,
                            recursive_chmod=True
                            )
Example No. 47
def get_component_version(stack_name, component_name):
  """
  For any stack name, returns the version currently installed for a given component.
  Because each stack name may have different logic, the input is a generic dictionary.
  :param stack_name: one of HDP, HDPWIN, BIGTOP, PHD, etc. usually retrieved from
  the command-#.json file's ["hostLevelParams"]["stack_name"]
  :param component_name: Component name as a string necessary to get the version
  :return: Returns a string if found, e.g., 2.2.1.0-2175, otherwise, returns None
  """
  version = None
  if stack_name is None or component_name is None:
    Logger.error("Could not determine component version because of the parameters is empty. " \
                 "stack_name: %s, component_name: %s" % (str(stack_name), str(component_name)))
    return version

  out = None
  code = -1
  if stack_name == "HDP":
    tmpfile = tempfile.NamedTemporaryFile()

    get_hdp_comp_version_cmd = ""
    try:
      # This is necessary because Ubuntu returns "stdin: is not a tty", see AMBARI-8088
      with open(tmpfile.name, 'r') as file:
        get_hdp_comp_version_cmd = '/usr/bin/hdp-select status %s > %s' % (component_name, tmpfile.name)
        code, stdoutdata = shell.call(get_hdp_comp_version_cmd)
        out = file.read()

      if code != 0 or out is None:
        raise Exception("Code is nonzero or output is empty")

      Logger.debug("Command: %s\nOutput: %s" % (get_hdp_comp_version_cmd, str(out)))
      matches = re.findall(r"([\d\.]+\-\d+)", out)
      version = matches[0] if matches and len(matches) > 0 else None
    except Exception, e:
      Logger.error("Could not determine HDP version for component %s by calling '%s'. Return Code: %s, Output: %s." %
                   (component_name, get_hdp_comp_version_cmd, str(code), str(out)))
Example No. 48
def get_check_command(oozie_url, host_name, parameters):
  security_enabled = False
  if SECURITY_ENABLED in parameters:
    security_enabled = str(parameters[SECURITY_ENABLED]).upper() == 'TRUE'
  kerberos_env = None
  if security_enabled:
    if OOZIE_KEYTAB in parameters and OOZIE_PRINCIPAL in parameters:
      oozie_keytab = parameters[OOZIE_KEYTAB]
      oozie_principal = parameters[OOZIE_PRINCIPAL]

      # substitute _HOST in kerberos principal with actual fqdn
      oozie_principal = oozie_principal.replace('_HOST', host_name)
    else:
      raise KerberosPropertiesNotFound('The Oozie keytab and principal are required parameters when security is enabled.')

    # Create the kerberos credentials cache (ccache) file and set it in the environment to use
    # when executing curl
    env = Environment.get_instance()
    ccache_file = "{0}{1}oozie_alert_cc_{2}".format(env.tmp_dir, os.sep, os.getpid())
    kerberos_env = {'KRB5CCNAME': ccache_file}

    klist_path_local = get_klist_path()
    klist_command = format("{klist_path_local} -s {ccache_file}")

    # Determine if we need to kinit by testing to see if the relevant cache exists and has
    # non-expired tickets.  Tickets are marked to expire after 5 minutes to help reduce the number
    # it kinits we do but recover quickly when keytabs are regenerated
    return_code, _ = call(klist_command)
    if return_code != 0:
      kinit_path_local = get_kinit_path()
      kinit_command = format("{kinit_path_local} -l 5m -kt {oozie_keytab} {oozie_principal}; ")

      # kinit
      Execute(kinit_command, environment=kerberos_env)
  command = format("source /etc/oozie/conf/oozie-env.sh ; oozie admin -oozie {oozie_url} -status")
  return (command, kerberos_env)
Example No. 49
def initialize_ha_zookeeper(params):
  try:
    iterations = 10
    formatZK_cmd = "hdfs zkfc -formatZK -nonInteractive"
    Logger.info("Initialize HA state in ZooKeeper: %s" % (formatZK_cmd))
    for i in range(iterations):
      Logger.info('Try %d out of %d' % (i+1, iterations))
      code, out = shell.call(formatZK_cmd, logoutput=False, user=params.hdfs_user)
      if code == 0:
        Logger.info("HA state initialized in ZooKeeper successfully")
        return True
      elif code == 2:
        Logger.info("HA state already initialized in ZooKeeper")
        return True
      # Precondition to starting zkfc is being formatted.
      # So zkfc being already started means format was already done.
      elif code == 1 and "zkfc is running as process " in out:
        Logger.info("HA state already initialized in ZooKeeper, since '{0}'".format(out))
        return True
      else:
        Logger.warning('HA state initialization in ZooKeeper failed with %d error code. Will retry' % (code))
  except Exception as ex:
    Logger.error('HA state initialization in ZooKeeper threw an exception. Reason %s' %(str(ex)))
  return False
Example No. 50
def kill_zkfc(zkfc_user):
  """
  There are two potential methods for failing over the namenode, especially during a Rolling Upgrade.
  Option 1. Kill zkfc on primary namenode provided that the secondary is up and has zkfc running on it.
  Option 2. Silent failover
  :param zkfc_user: User that started the ZKFC process.
  :return: Return True if ZKFC was killed, otherwise, false.
  """
  import params
  if params.dfs_ha_enabled:
    if params.zkfc_pid_file:
      check_process = as_user(format("ls {zkfc_pid_file} > /dev/null 2>&1 && ps -p `cat {zkfc_pid_file}` > /dev/null 2>&1"), user=zkfc_user)
      code, out = shell.call(check_process)
      if code == 0:
        Logger.debug("ZKFC is running and will be killed.")
        kill_command = format("kill -15 `cat {zkfc_pid_file}`")
        Execute(kill_command,
                user=zkfc_user
        )
        File(params.zkfc_pid_file,
             action = "delete",
             )
        return True
  return False
Exemplo n.º 51
0
  def prepare_warfile():
    """
    Invokes the 'prepare-war' command in Oozie in order to create the WAR.
    The prepare-war command uses the input WAR from ${OOZIE_HOME}/oozie.war and
    outputs the prepared WAR to ${CATALINA_BASE}/webapps/oozie.war - because of this,
    both of these environment variables must point to the upgraded oozie-server path and
    not oozie-client since it was not yet updated.

    This method will also perform a kinit if necessary.
    :return:
    """
    import params

    # get the kerberos token if necessary to execute commands as oozie
    if params.security_enabled:
      oozie_principal_with_host = params.oozie_principal.replace("_HOST", params.hostname)
      command = format("{kinit_path_local} -kt {oozie_keytab} {oozie_principal_with_host}")
      Execute(command, user=params.oozie_user, logoutput=True)

    # setup environment
    environment = { "CATALINA_BASE" : "/usr/hdp/current/oozie-server/oozie-server",
      "OOZIE_HOME" : "/usr/hdp/current/oozie-server" }

    # prepare the oozie WAR
    command = format("{oozie_setup_sh} prepare-war {oozie_secure} -d {oozie_libext_dir}")
    return_code, oozie_output = shell.call(command, user=params.oozie_user,
      logoutput=False, quiet=False, env=environment)

    # set it to "" to prevent a possible iteration issue
    if oozie_output is None:
      oozie_output = ""

    if return_code != 0 or "New Oozie WAR file with added".lower() not in oozie_output.lower():
      message = "Unexpected Oozie WAR preparation output {0}".format(oozie_output)
      Logger.error(message)
      raise Fail(message)
Exemplo n.º 52
0
def create(stack_name, package, version, dry_run=False):
    """
  Creates a config version for the specified package
  :param stack_name: the name of the stack
  :param package: the name of the package, as-used by conf-select
  :param version: the version number to create
  :return List of directories created
  """
    Logger.info(
        "Checking if need to create versioned conf dir /etc/{0}/{1}/0".format(
            package, version))
    if not _valid(stack_name, package, version):
        Logger.info("Will not create it since parameters are not valid.")
        return []

    command = "dry-run-create" if dry_run else "create-conf-dir"

    code, stdout, stderr = shell.call(get_cmd(command, package, version),
                                      logoutput=False,
                                      quiet=False,
                                      sudo=True,
                                      stderr=subprocess.PIPE)

    # conf-select can set more than one directory
    # per package, so return that list, especially for dry_run
    dirs = []
    if 0 == code and stdout is not None:  # just be sure we have a stdout
        for line in stdout.splitlines():
            dirs.append(line.rstrip('\n'))

    # take care of permissions
    if not code and stdout and command == "create-conf-dir":
        for d in dirs:
            Directory(d, mode=0755, cd_access='a', create_parents=True)

    return dirs
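
A short sketch of driving create() above, first as a dry run and then for real; the stack, package and version values are placeholders and the dry-run-first pattern is only illustrative.

# Hypothetical usage; "HDP", "hadoop" and the version string are placeholders.
proposed = create("HDP", "hadoop", "2.3.2.0-2950", dry_run=True)   # only reports the directories
if proposed:
    created = create("HDP", "hadoop", "2.3.2.0-2950")              # creates them and sets permissions
    for conf_dir in created:
        Logger.info("Prepared versioned conf dir: {0}".format(conf_dir))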
Exemplo n.º 53
0
def get_uid(user):
    import params
    user_str = str(user) + "_uid"
    service_env = [
        serviceEnv for serviceEnv in params.config['configurations']
        if user_str in params.config['configurations'][serviceEnv]
    ]

    if service_env and params.config['configurations'][
            service_env[0]][user_str]:
        service_env_str = str(service_env[0])
        uid = params.config['configurations'][service_env_str][user_str]
        if len(service_env) > 1:
            Logger.warning("Multiple values found for %s, using %s" %
                           (user_str, uid))
        return uid
    else:
        if user == params.smoke_user:
            return None
        File(format("{tmp_dir}/changeUid.sh"),
             content=StaticFile("changeToSecureUid.sh"),
             mode=0555)
        code, newUid = shell.call(format("{tmp_dir}/changeUid.sh {user}"))
        return int(newUid)
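
A hypothetical caller of get_uid(); the usermod command is an assumption made purely to show how the returned uid could be consumed.

# Hypothetical: reuse a previously configured uid (or the one generated by changeUid.sh)
# when adjusting the hdfs service account.
import params
from resource_management.core.resources.system import Execute

hdfs_uid = get_uid(params.hdfs_user)
if hdfs_uid is not None:
    Execute("usermod -u {0} {1}".format(hdfs_uid, params.hdfs_user))   # illustrative only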
Exemplo n.º 54
0
 def locales(self):
     code, out = shell.call("locale -a")
     return out.strip().split("\n")
Exemplo n.º 55
0
 def _check_existence(self, name):
     code, out = shell.call(CHECK_CMD % name)
     return not bool(code)
Exemplo n.º 56
0
 def machine(self):
     code, out = shell.call(["/bin/uname", "-m"])
     return out.strip()
Exemplo n.º 57
0
def initiate_safe_zkfc_failover():
    """
  If this is the active namenode, initiate a safe failover and wait for it to become the standby.

  If an error occurs, force a failover to happen by killing zkfc on this host. In this case, during the Restart,
  will also have to start ZKFC manually.
  """
    import params

    # Must kinit before running the HDFS command
    if params.security_enabled:
        Execute(format(
            "{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"),
                user=params.hdfs_user)

    active_namenode_id = None
    standby_namenode_id = None
    active_namenodes, standby_namenodes, unknown_namenodes = get_namenode_states(
        params.hdfs_site, params.security_enabled, params.hdfs_user)
    if active_namenodes:
        active_namenode_id = active_namenodes[0][0]
    if standby_namenodes:
        standby_namenode_id = standby_namenodes[0][0]

    if active_namenode_id:
        Logger.info(format("Active NameNode id: {active_namenode_id}"))
    if standby_namenode_id:
        Logger.info(format("Standby NameNode id: {standby_namenode_id}"))
    if unknown_namenodes:
        for unknown_namenode in unknown_namenodes:
            Logger.info("NameNode HA state for {0} is unknown".format(
                unknown_namenode[0]))

    if params.namenode_id == active_namenode_id and params.other_namenode_id == standby_namenode_id:
        # Failover if this NameNode is active and other NameNode is up and in standby (i.e. ready to become active on failover)
        Logger.info(
            format(
                "NameNode {namenode_id} is active and NameNode {other_namenode_id} is in standby"
            ))

        failover_command = format(
            "hdfs haadmin -ns {dfs_ha_nameservices} -failover {namenode_id} {other_namenode_id}"
        )
        check_standby_cmd = format(
            "hdfs haadmin -ns {dfs_ha_nameservices} -getServiceState {namenode_id} | grep standby"
        )

        msg = "Rolling Upgrade - Initiating a ZKFC failover on active NameNode host {0}.".format(
            params.hostname)
        Logger.info(msg)
        code, out = shell.call(failover_command,
                               user=params.hdfs_user,
                               logoutput=True)
        Logger.info(
            format("Rolling Upgrade - failover command returned {code}"))
        wait_for_standby = False

        if code == 0:
            wait_for_standby = True
        else:
            # Try to kill ZKFC manually
            was_zkfc_killed = kill_zkfc(params.hdfs_user)
            code, out = shell.call(check_standby_cmd,
                                   user=params.hdfs_user,
                                   logoutput=True)
            Logger.info(
                format("Rolling Upgrade - check for standby returned {code}"))
            if code == 255 and out:
                Logger.info("Rolling Upgrade - NameNode is already down.")
            else:
                if was_zkfc_killed:
                    # Only mandate that this be the standby namenode if ZKFC was indeed killed to initiate a failover.
                    wait_for_standby = True

        if wait_for_standby:
            Logger.info("Waiting for this NameNode to become the standby one.")
            Execute(check_standby_cmd,
                    user=params.hdfs_user,
                    tries=50,
                    try_sleep=6,
                    logoutput=True)
    else:
        msg = "Rolling Upgrade - Skipping ZKFC failover on NameNode host {0}.".format(
            params.hostname)
        Logger.info(msg)
Exemplo n.º 58
0
def namenode(action=None,
             hdfs_binary=None,
             do_format=True,
             upgrade_type=None,
             env=None):
    if action is None:
        raise Fail('"action" parameter is required for function namenode().')

    if action in ["start", "stop"] and hdfs_binary is None:
        raise Fail(
            '"hdfs_binary" parameter is required for function namenode().')

    if action == "configure":
        import params
        # we need this directory to be present before any action (HA manual steps for
        # additional namenode)
        create_name_dirs(params.dfs_name_dir)
    elif action == "start":
        Logger.info("Called service {0} with upgrade_type: {1}".format(
            action, str(upgrade_type)))
        setup_ranger_hdfs(upgrade_type=upgrade_type)
        import params
        if do_format:
            format_namenode()
            pass

        File(params.exclude_file_path,
             content=Template("exclude_hosts_list.j2"),
             owner=params.hdfs_user,
             group=params.user_group)

        if params.dfs_ha_enabled and \
          params.dfs_ha_namenode_standby is not None and \
          params.hostname == params.dfs_ha_namenode_standby:
            # if the current host is the standby NameNode in an HA deployment
            # run the bootstrap command, to start the NameNode in standby mode
            # this requires that the active NameNode is already up and running,
            # so this execute should be re-tried upon failure, up to a timeout
            success = bootstrap_standby_namenode(params)
            if not success:
                raise Fail("Could not bootstrap standby namenode")

        if upgrade_type == "rolling" and params.dfs_ha_enabled:
            # Most likely, ZKFC is up since RU will initiate the failover command. However, if that failed, it would have tried
            # to kill ZKFC manually, so we need to start it if not already running.
            safe_zkfc_op(action, env)

        options = ""
        if upgrade_type == "rolling":
            options = "-rollingUpgrade started"
        elif upgrade_type == "nonrolling":
            is_previous_image_dir = is_previous_fs_image()
            Logger.info(
                format(
                    "Previous file system image dir present is {is_previous_image_dir}"
                ))

            if params.dfs_ha_enabled:
                if params.desired_namenode_role is None:
                    raise Fail(
                        "Did not receive parameter \"desired_namenode_role\" to indicate the role that this NameNode should have."
                    )

                if params.desired_namenode_role == "active":
                    # The "-upgrade" command can only be used exactly once. If used more than once during a retry, it will cause problems.
                    options = "" if is_previous_image_dir else "-upgrade"

                if params.desired_namenode_role == "standby":
                    options = "-bootstrapStandby -force"
            else:
                # Both Primary and Secondary NameNode can use the same command.
                options = "" if is_previous_image_dir else "-upgrade"
        Logger.info(format("Option for start command: {options}"))

        service(action="start",
                name="namenode",
                user=params.hdfs_user,
                options=options,
                create_pid_dir=True,
                create_log_dir=True)

        if params.security_enabled:
            Execute(format(
                "{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"
            ),
                    user=params.hdfs_user)

        is_namenode_safe_mode_off = format(
            "{hdfs_binary} dfsadmin -fs {namenode_address} -safemode get | grep 'Safe mode is OFF'"
        )
        if params.dfs_ha_enabled:
            is_active_namenode_cmd = as_user(format(
                "{hdfs_binary} --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"
            ),
                                             params.hdfs_user,
                                             env={
                                                 'PATH': params.hadoop_bin_dir
                                             })
        else:
            is_active_namenode_cmd = False

        # During NonRolling Upgrade, both NameNodes are initially down,
        # so no point in checking if this is the active or standby.
        if upgrade_type == "nonrolling":
            is_active_namenode_cmd = False

        # ___Scenario___________|_Expected safemode state__|_Wait for safemode OFF____|
        # no-HA                 | ON -> OFF                | Yes                      |
        # HA and active         | ON -> OFF                | Yes                      |
        # HA and standby        | no change                | no check                 |
        # RU with HA on active  | ON -> OFF                | Yes                      |
        # RU with HA on standby | ON -> OFF                | Yes                      |
        # EU with HA on active  | no change                | no check                 |
        # EU with HA on standby | no change                | no check                 |
        # EU non-HA             | no change                | no check                 |

        check_for_safemode_off = False
        msg = ""
        if params.dfs_ha_enabled:
            if upgrade_type is not None:
                check_for_safemode_off = True
                msg = "Must wait to leave safemode since High Availability is enabled during a Stack Upgrade"
            else:
                # During normal operations, the NameNode is expected to be up.
                code, out = shell.call(
                    is_active_namenode_cmd,
                    logoutput=True)  # If active NN, code will be 0
                if code == 0:  # active
                    check_for_safemode_off = True
                    msg = "Must wait to leave safemode since High Availability is enabled and this is the Active NameNode."
                else:
                    msg = "Will remain in the current safemode state."
        else:
            msg = "Must wait to leave safemode since High Availability is not enabled."
            check_for_safemode_off = True

        Logger.info(msg)

        # During a NonRolling (aka Express Upgrade), stay in safemode since the DataNodes are down.
        stay_in_safe_mode = False
        if upgrade_type == "nonrolling":
            stay_in_safe_mode = True

        if check_for_safemode_off:
            Logger.info("Stay in safe mode: {0}".format(stay_in_safe_mode))
            if not stay_in_safe_mode:
                Logger.info(
                    "Wait to leafe safemode since must transition from ON to OFF."
                )
                try:
                    # Wait up to 30 mins
                    Execute(is_namenode_safe_mode_off,
                            tries=180,
                            try_sleep=10,
                            user=params.hdfs_user,
                            logoutput=True)
                except Fail:
                    Logger.error(
                        "NameNode is still in safemode, please be careful with commands that need safemode OFF."
                    )

        # Always run this on non-HA, or active NameNode during HA.
        create_hdfs_directories(is_active_namenode_cmd)

    elif action == "stop":
        import params
        service(action="stop", name="namenode", user=params.hdfs_user)
    elif action == "status":
        import status_params
        check_process_status(status_params.namenode_pid_file)
    elif action == "decommission":
        decommission()
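
As a usage sketch, a NameNode service script could dispatch its lifecycle hooks into namenode() above roughly as follows; the hdfs binary path is a placeholder and the wrapper functions are hypothetical.

# Hypothetical service-script wrappers around namenode(); the binary path is assumed.
def start(env, upgrade_type=None):
    import params
    env.set_params(params)
    hdfs_binary = "/usr/hdp/current/hadoop-hdfs-namenode/bin/hdfs"   # placeholder path
    namenode(action="start", hdfs_binary=hdfs_binary, upgrade_type=upgrade_type, env=env)

def stop(env, upgrade_type=None):
    import params
    env.set_params(params)
    hdfs_binary = "/usr/hdp/current/hadoop-hdfs-namenode/bin/hdfs"   # placeholder path
    namenode(action="stop", hdfs_binary=hdfs_binary, upgrade_type=upgrade_type, env=env)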
Exemplo n.º 59
0
def curl_krb_request(tmp_dir,
                     keytab,
                     principal,
                     url,
                     cache_file_prefix,
                     krb_exec_search_paths,
                     return_only_http_code,
                     caller_label,
                     user,
                     connection_timeout=CONNECTION_TIMEOUT_DEFAULT,
                     ca_certs=None,
                     kinit_timer_ms=DEFAULT_KERBEROS_KINIT_TIMER_MS,
                     method='',
                     body='',
                     header=''):
    """
  Makes a curl request using the kerberos credentials stored in a calculated cache file. The
  cache file is created by combining the supplied principal, keytab, user, and request name into
  a unique hash.

  This function will use the klist command to determine if the cache is expired and will perform
  a kinit if necessary. Additionally, it has an internal timer to force a kinit after a
  configurable amount of time. This is to prevent boundary issues where requests hit the edge
  of a ticket's lifetime.

  :param tmp_dir: the directory to use for storing the local kerberos cache for this request.
  :param keytab: the location of the keytab to use when performing a kinit
  :param principal: the principal to use when performing a kinit
  :param url: the URL to request
  :param cache_file_prefix: an identifier used to build the unique cache name for this request.
                            This ensures that multiple requests can use the same cache.
  :param krb_exec_search_paths: the search path to use for invoking kerberos binaries
  :param return_only_http_code: True to return only the HTTP code, False to return GET content
  :param caller_label: an identifier to give context into the caller of this module (used for logging)
  :param user: the user to invoke the curl command as
  :param connection_timeout: if specified, a connection timeout for curl (default 10 seconds)
  :param ca_certs: path to certificates
  :param kinit_timer_ms: if specified, the time (in ms), before forcing a kinit even if the
                         klist cache is still valid.
  :param method: if specified, an HTTP method to pass to curl via -X (e.g. POST)
  :param body: if specified, a request body to pass to curl via -d
  :param header: if specified, an HTTP header to pass to curl via -H
  :return: a tuple of (response body or HTTP code, error message, elapsed request time)
  """

    import uuid
    # backward compatibility with old code and management packs, etc. All new code needs to pass ca_certs explicitly.
    if ca_certs is None:
        try:
            from ambari_agent.AmbariConfig import AmbariConfig
            ca_certs = AmbariConfig.get_resolved_config(
            ).get_ca_cert_file_path()
        except:
            pass
    # start off false
    is_kinit_required = False

    # Create the kerberos credentials cache (ccache) file and set it in the environment to use
    # when executing curl. Use the md5 hash of the combination of the principal and keytab file
    # to generate a (relatively) unique cache filename so that we can use it as needed. Scope
    # this file by user in order to prevent sharing of cache files by multiple users.
    ccache_file_name = _md5("{0}|{1}".format(principal, keytab)).hexdigest()

    curl_krb_cache_path = os.path.join(tmp_dir, "curl_krb_cache")
    if not os.path.exists(curl_krb_cache_path):
        os.makedirs(curl_krb_cache_path)
    os.chmod(curl_krb_cache_path, 01777)

    ccache_file_path = "{0}{1}{2}_{3}_cc_{4}".format(curl_krb_cache_path,
                                                     os.sep, cache_file_prefix,
                                                     user, ccache_file_name)
    kerberos_env = {'KRB5CCNAME': ccache_file_path}

    # concurrent kinit's can cause the following error:
    # Internal credentials cache error while storing credentials while getting initial credentials
    kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
    kinit_lock.acquire()
    try:
        # If there are no tickets in the cache or they are expired, perform a kinit, else use what
        # is in the cache
        if krb_exec_search_paths:
            klist_path_local = get_klist_path(krb_exec_search_paths)
        else:
            klist_path_local = get_klist_path()

        # take a look at the last time kinit was run for the specified cache and force a new
        # kinit if it's time; this helps to avoid problems approaching ticket boundary when
        # executing a klist and then a curl
        last_kinit_time = _KINIT_CACHE_TIMES.get(ccache_file_name, 0)
        current_time = long(time.time())
        if current_time - kinit_timer_ms > last_kinit_time:
            is_kinit_required = True

        # if the time has not expired, double-check that the cache still has a valid ticket
        if not is_kinit_required:
            klist_command = "{0} -s {1}".format(klist_path_local,
                                                ccache_file_path)
            is_kinit_required = (shell.call(klist_command, user=user)[0] != 0)

        # if kinit is required, then perform the kinit
        if is_kinit_required:
            if krb_exec_search_paths:
                kinit_path_local = get_kinit_path(krb_exec_search_paths)
            else:
                kinit_path_local = get_kinit_path()

            logger.debug(
                "Enabling Kerberos authentication for %s via GSSAPI using ccache at %s",
                caller_label, ccache_file_path)

            # kinit; there's no need to set a ticket timeout as this will use the default invalidation
            # configured in the krb5.conf - regenerating keytabs will not prevent an existing cache
            # from working correctly
            shell.checked_call("{0} -c {1} -kt {2} {3} > /dev/null".format(
                kinit_path_local, ccache_file_path, keytab, principal),
                               user=user)

            # record kinit time
            _KINIT_CACHE_TIMES[ccache_file_name] = current_time
        else:
            # no kinit needed, use the cache
            logger.debug(
                "Kerberos authentication for %s via GSSAPI already enabled using ccache at %s.",
                caller_label, ccache_file_path)
    finally:
        kinit_lock.release()

    # check if cookies dir exists, if not then create it
    cookies_dir = os.path.join(tmp_dir, "cookies")

    if not os.path.exists(cookies_dir):
        os.makedirs(cookies_dir)

    cookie_file_name = str(uuid.uuid4())
    cookie_file = os.path.join(cookies_dir, cookie_file_name)

    start_time = time.time()
    error_msg = None

    # setup timeouts for the request; ensure we use integers since that is what curl needs
    connection_timeout = int(connection_timeout)
    maximum_timeout = connection_timeout + 2

    ssl_options = ['-k']
    if ca_certs:
        ssl_options = ['--cacert', ca_certs]
    try:
        if return_only_http_code:
            _, curl_stdout, curl_stderr = get_user_call_output(
                ['curl', '--location-trusted'] + ssl_options + [
                    '--negotiate', '-u', ':', '-b', cookie_file, '-c',
                    cookie_file, '-w', '%{http_code}', url,
                    '--connect-timeout',
                    str(connection_timeout), '--max-time',
                    str(maximum_timeout), '-o', '/dev/null'
                ],
                user=user,
                env=kerberos_env)
        else:
            curl_command = ['curl', '--location-trusted'] + ssl_options + [
                '--negotiate', '-u', ':', '-b', cookie_file, '-c', cookie_file,
                url, '--connect-timeout',
                str(connection_timeout), '--max-time',
                str(maximum_timeout)
            ]
            # returns response body
            if len(method) > 0 and len(body) == 0 and len(header) == 0:
                curl_command.extend(['-X', method])

            elif len(method) > 0 and len(body) == 0 and len(header) > 0:
                curl_command.extend(['-H', header, '-X', method])

            elif len(method) > 0 and len(body) > 0 and len(header) == 0:
                curl_command.extend(['-X', method, '-d', body])

            elif len(method) > 0 and len(body) > 0 and len(header) > 0:
                curl_command.extend(['-H', header, '-X', method, '-d', body])

            _, curl_stdout, curl_stderr = get_user_call_output(
                curl_command, user=user, env=kerberos_env)

    except Fail:
        if logger.isEnabledFor(logging.DEBUG):
            logger.exception(
                "Unable to make a curl request for {0}.".format(caller_label))
        raise
    finally:
        if os.path.isfile(cookie_file):
            os.remove(cookie_file)

    # an empty string evaluates to False
    if curl_stderr:
        error_msg = curl_stderr

    time_millis = time.time() - start_time

    # an empty string evaluates to False
    if curl_stdout:
        if return_only_http_code:
            return (int(curl_stdout), error_msg, time_millis)
        else:
            return (curl_stdout, error_msg, time_millis)

    logger.debug("The curl response for %s is empty; standard error = %s",
                 caller_label, str(error_msg))

    return ("", error_msg, time_millis)
Exemplo n.º 60
0
    def rebalancehdfs(self, env):
        import params
        env.set_params(params)

        name_node_parameters = json.loads(params.name_node_params)
        threshold = name_node_parameters['threshold']
        _print("Starting balancer with threshold = %s\n" % threshold)

        rebalance_env = {'PATH': params.hadoop_bin_dir}

        if params.security_enabled:
            # Create the kerberos credentials cache (ccache) file and set it in the environment to use
            # when executing HDFS rebalance command. Use the md5 hash of the combination of the principal and keytab file
            # to generate a (relatively) unique cache filename so that we can use it as needed.
            # TODO: params.tmp_dir=/var/lib/ambari-agent/tmp. However hdfs user doesn't have access to this path.
            # TODO: Hence using /tmp
            ccache_file_name = "hdfs_rebalance_cc_" + _md5(
                format(
                    "{hdfs_principal_name}|{hdfs_user_keytab}")).hexdigest()
            ccache_file_path = os.path.join(tempfile.gettempdir(),
                                            ccache_file_name)
            rebalance_env['KRB5CCNAME'] = ccache_file_path

            # If there are no tickets in the cache or they are expired, perform a kinit, else use what
            # is in the cache
            klist_cmd = format("{klist_path_local} -s {ccache_file_path}")
            kinit_cmd = format(
                "{kinit_path_local} -c {ccache_file_path} -kt {hdfs_user_keytab} {hdfs_principal_name}"
            )
            if shell.call(klist_cmd, user=params.hdfs_user)[0] != 0:
                Execute(kinit_cmd, user=params.hdfs_user)

        def calculateCompletePercent(first, current):
            # avoid division by zero
            try:
                division_result = current.bytesLeftToMove / first.bytesLeftToMove
            except ZeroDivisionError:
                Logger.warning(
                    "Division by zero. Bytes Left To Move = {0}. Return 1.0".
                    format(first.bytesLeftToMove))
                return 1.0
            return 1.0 - division_result

        def startRebalancingProcess(threshold, rebalance_env):
            rebalanceCommand = format(
                'hdfs --config {hadoop_conf_dir} balancer -threshold {threshold}'
            )
            return as_user(rebalanceCommand,
                           params.hdfs_user,
                           env=rebalance_env)

        command = startRebalancingProcess(threshold, rebalance_env)

        basedir = os.path.join(env.config.basedir, 'scripts')
        if (threshold == 'DEBUG'):  #FIXME TODO remove this on PROD
            basedir = os.path.join(env.config.basedir, 'scripts',
                                   'balancer-emulator')
            command = ['ambari-python-wrap', 'hdfs-command.py']

        _print("Executing command %s\n" % command)

        parser = hdfs_rebalance.HdfsParser()

        def handle_new_line(line, is_stderr):
            if is_stderr:
                return

            _print('[balancer] %s' % (line))
            pl = parser.parseLine(line)
            if pl:
                res = pl.toJson()
                res['completePercent'] = calculateCompletePercent(
                    parser.initialLine, pl)

                self.put_structured_out(res)
            elif parser.state == 'PROCESS_FINISED':
                _print('[balancer] %s' % ('Process is finished'))
                self.put_structured_out({'completePercent': 1})
                return

        Execute(
            command,
            on_new_line=handle_new_line,
            logoutput=False,
        )

        if params.security_enabled:
            # Delete the kerberos credentials cache (ccache) file
            File(
                ccache_file_path,
                action="delete",
            )
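
To make the progress arithmetic in calculateCompletePercent() concrete, a small worked example under the assumption that HdfsParser reports bytesLeftToMove as a float:

# Hypothetical balancer progress values, in bytes.
first_bytes_left   = 400.0 * 1024 ** 3   # 400 GB left to move when the balancer started
current_bytes_left = 100.0 * 1024 ** 3   # 100 GB left to move now

complete_percent = 1.0 - (current_bytes_left / first_bytes_left)
print(complete_percent)   # 0.75, i.e. the rebalance is roughly 75% complete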