Example #1
def _check_decommission(cluster, instances, check_func, timeout):
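    # Poll check_func every few seconds until every listed instance reports
    # the 'decommissioned' status, or raise DecommissionError on timeout.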
    s_time = timeutils.utcnow()
    while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
        statuses = check_func(cluster)
        dec_ok = True
        for instance in instances:
            if statuses[instance.fqdn()] != 'decommissioned':
                dec_ok = False

        if dec_ok:
            return
        else:
            context.sleep(5)
    else:
        raise ex.DecommissionError(
            _("Cannot finish decommission of cluster %(cluster)s in "
              "%(seconds)d seconds") % {
                  "cluster": cluster,
                  "seconds": timeout
              })
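
A minimal usage sketch (the get_datanodes_status callback, the
instances_to_remove list, and the 300-second timeout below are hypothetical;
each plugin supplies its own status-check function):

def get_datanodes_status(cluster):
    # Hypothetical callback: a real plugin would query the cluster here and
    # return a mapping of instance FQDN -> decommission status, e.g.:
    return {'dn-1.example.org': 'decommissioned',
            'dn-2.example.org': 'decommissioned'}

# Block until every instance in instances_to_remove reports 'decommissioned',
# or raise DecommissionError once the 300-second timeout elapses.
_check_decommission(cluster, instances_to_remove, get_datanodes_status, 300)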
Example #2
def decommission_dn(nn, inst_to_be_deleted, survived_inst):
    with remote.get_remote(nn) as r:
        r.write_file_to('/etc/hadoop/dn.excl',
                        utils.generate_fqdn_host_names(inst_to_be_deleted))
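        # Refresh the NameNode's node lists so that decommissioning of the
        # excluded datanodes starts.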
        run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
        context.sleep(3)

        timeout = config_helper.get_decommissioning_timeout(
            nn.node_group.cluster)
        s_time = timeutils.utcnow()
        all_found = False

        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
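            # Re-query the NameNode for a fresh datanode report on each pass.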
            cmd = r.execute_command(
                "sudo su -c 'hadoop dfsadmin -report' hadoop")
            all_found = True
            datanodes_info = parse_dfs_report(cmd[1])
            for i in inst_to_be_deleted:
                for dn in datanodes_info:
                    if (dn["Name"].startswith(i.internal_ip)) and (
                            dn["Decommission Status"] != "Decommissioned"):
                        all_found = False
                        break

            if all_found:
                r.write_files_to({
                    '/etc/hadoop/dn.incl':
                    utils.generate_fqdn_host_names(survived_inst),
                    '/etc/hadoop/dn.excl':
                    "",
                })
                break
            context.sleep(3)

        if not all_found:
            raise ex.DecommissionError(
                _("Cannot finish decommission of cluster %(cluster)s in "
                  "%(seconds)d seconds") % {
                      "cluster": nn.node_group.cluster,
                      "seconds": timeout
                  })
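
parse_dfs_report is referenced above but not shown; a rough sketch of such a
parser (an assumption, not the plugin's actual implementation) splits the
'hadoop dfsadmin -report' output into per-datanode blocks and exposes the
"Name" and "Decommission Status" fields the loop above consults:

def parse_dfs_report(raw_report):
    # Hypothetical parser: split the report into blank-line-separated blocks
    # and turn each block's "Key : value" lines into a dict.
    datanodes = []
    for block in raw_report.split('\n\n'):
        info = {}
        for line in block.splitlines():
            if ':' in line:
                key, _sep, value = line.partition(':')
                info[key.strip()] = value.strip()
        if 'Name' in info:
            datanodes.append(info)
    return datanodes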
Example #3
    def decommission_cluster_instances(self, cluster, clusterspec, instances,
                                       ambari_info):

        request_uri = self._get_command_request_uri(ambari_info, cluster.name)

        hosts_to_decommission = []
        # Decommission HDFS datanodes to avoid loss of data
        # during decommissioning process
        for instance in instances:
            ng_name = instance.node_group.name
            if "DATANODE" in clusterspec.node_groups[ng_name].components:
                # determine the instances that include HDFS support
                hosts_to_decommission.append(instance.fqdn())

        LOG.debug('AmbariClient: hosts_to_decommission = ' +
                  str(hosts_to_decommission))

        # template for request body
        body_header = ('{"RequestInfo" : { "context": "Decommission DataNode",'
                       ' "command" : "DECOMMISSION", "service_name" : "HDFS",'
                       ' "component_name" : "NAMENODE", '
                       ' "parameters" : { "slave_type" : "DATANODE", ')

        excluded_hosts_request = '"excluded_hosts" : "{0}"'

        # generate comma-separated list of hosts to de-commission
        list_of_hosts = ",".join(hosts_to_decommission)

        LOG.debug('AmbariClient: list_of_hosts = ' + list_of_hosts)

        # create the request body
        request_body = (
            body_header + excluded_hosts_request.format(list_of_hosts) + '}}' +
            ', "Requests/resource_filters":[{"service_name":"HDFS",'
            '"component_name":"NAMENODE"}]}')

        LOG.debug('AmbariClient: about to make decommission request, uri = ' +
                  request_uri)
        LOG.debug('AmbariClient: about to make decommission request, '
                  'request body = ' + request_body)

        # ask Ambari to decommission the datanodes
        result = self._post(request_uri, ambari_info, request_body)
        if result.status_code != 202:
            LOG.error(
                _LE('AmbariClient: error while making decommission post '
                    'request. Error is = %s'), result.text)
            raise ex.DecommissionError(
                _('An error occurred while trying to '
                  'decommission the DataNode instances that are '
                  'being shut down. '
                  'Please consult the Ambari server logs on the '
                  'master node for '
                  'more information about the failure.'))
        else:
            LOG.info(_LI('AmbariClient: decommission post request succeeded!'))

        status_template = ('http://{0}/api/v1/clusters/{1}/hosts/{2}/'
                           'host_components/{3}')

        # find the host that the NameNode is deployed on
        name_node_host = clusterspec.determine_component_hosts(
            'NAMENODE').pop()
        status_request = status_template.format(ambari_info.get_address(),
                                                cluster.name,
                                                name_node_host.fqdn(),
                                                'NAMENODE')

        LOG.debug('AmbariClient: about to make decommission status request, '
                  'uri = ' + status_request)

        count = 0
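        # Poll the NameNode component status at most 100 times, 5 seconds
        # apart, until every host reports the "Decommissioned" admin state.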
        while count < 100 and len(hosts_to_decommission) > 0:
            LOG.info(
                _LI('AmbariClient: number of hosts waiting for '
                    'decommissioning to complete = %s'),
                str(len(hosts_to_decommission)))

            result = self._get(status_request, ambari_info)
            if result.status_code != 200:
                LOG.error(
                    _LE('AmbariClient: error in making decommission '
                        'status request, error = %s'), result.text)
            else:
                LOG.info(
                    _LI('AmbariClient: decommission status request ok, '
                        'result = %s'), result.text)
                json_result = json.loads(result.text)
                live_nodes = (
                    json_result['metrics']['dfs']['namenode']['LiveNodes'])
                # parse out the map of live hosts associated with the NameNode
                json_result_nodes = json.loads(live_nodes)
                for node, val in six.iteritems(json_result_nodes):
                    admin_state = val['adminState']
                    if admin_state == 'Decommissioned':
                        LOG.info(
                            _LI('AmbariClient: node = %(node)s is '
                                'now in adminState = %(admin_state)s'), {
                                    'node': node,
                                    'admin_state': admin_state
                                })
                        # remove from list, to track which nodes
                        # are now in Decommissioned state
                        hosts_to_decommission.remove(node)

            LOG.info(_LI('AmbariClient: sleeping for 5 seconds'))
            context.sleep(5)

            # increment loop counter
            count += 1

        if len(hosts_to_decommission) > 0:
            LOG.error(
                _LE('AmbariClient: decommissioning process timed-out '
                    'waiting for nodes to enter "Decommissioned" '
                    'status.'))
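
The request body above is assembled by string concatenation; an equivalent
sketch (the build_decommission_body helper is hypothetical, not part of the
plugin) builds the same structure as a dict and serializes it with
json.dumps, which avoids manual quoting:

import json

def build_decommission_body(excluded_hosts):
    # Hypothetical helper mirroring the concatenated body used above.
    body = {
        "RequestInfo": {
            "context": "Decommission DataNode",
            "command": "DECOMMISSION",
            "service_name": "HDFS",
            "component_name": "NAMENODE",
            "parameters": {
                "slave_type": "DATANODE",
                "excluded_hosts": ",".join(excluded_hosts),
            },
        },
        "Requests/resource_filters": [
            {"service_name": "HDFS", "component_name": "NAMENODE"},
        ],
    }
    return json.dumps(body)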