def _check_decommission(cluster, instances, check_func, timeout):
    s_time = timeutils.utcnow()
    while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
        # check_func returns a mapping of instance FQDN -> status string
        statuses = check_func(cluster)
        dec_ok = True
        for instance in instances:
            if statuses[instance.fqdn()] != 'decommissioned':
                dec_ok = False

        if dec_ok:
            return
        else:
            context.sleep(5)
    else:
        # the loop ran until the timeout without every instance reaching
        # the 'decommissioned' status
        raise ex.DecommissionError(
            _("Cannot finish decommission of cluster %(cluster)s in "
              "%(seconds)d seconds") %
            {"cluster": cluster, "seconds": timeout})
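

# --- Illustrative only: a minimal sketch of how _check_decommission() might
# --- be driven. The helper name _get_datanodes_status and the wrapper below
# --- are assumptions for this example; check_func just has to return a
# --- mapping of instance FQDN to a status string, where 'decommissioned'
# --- means the node has finished draining.
def _check_datanodes_decommission(cluster, instances, timeout):
    _check_decommission(cluster, instances, _get_datanodes_status, timeout)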


def decommission_dn(nn, inst_to_be_deleted, survived_inst):
    with remote.get_remote(nn) as r:
        # add the instances being removed to the datanode exclude file and
        # ask the namenode to re-read it
        r.write_file_to('/etc/hadoop/dn.excl',
                        utils.generate_fqdn_host_names(inst_to_be_deleted))
        run.refresh_nodes(remote.get_remote(nn), "dfsadmin")
        context.sleep(3)

        timeout = config_helper.get_decommissioning_timeout(
            nn.node_group.cluster)
        s_time = timeutils.utcnow()
        all_found = False

        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            cmd = r.execute_command(
                "sudo su -c 'hadoop dfsadmin -report' hadoop")
            all_found = True
            datanodes_info = parse_dfs_report(cmd[1])
            for i in inst_to_be_deleted:
                for dn in datanodes_info:
                    if (dn["Name"].startswith(i.internal_ip)) and (
                            dn["Decommission Status"] != "Decommissioned"):
                        all_found = False
                        break

            if all_found:
                # every datanode reports "Decommissioned": rewrite the
                # include file with the surviving instances and clear the
                # exclude file
                r.write_files_to({
                    '/etc/hadoop/dn.incl':
                        utils.generate_fqdn_host_names(survived_inst),
                    '/etc/hadoop/dn.excl': "",
                })
                break
            context.sleep(3)

        if not all_found:
            raise ex.DecommissionError(
                _("Cannot finish decommission of cluster %(cluster)s in "
                  "%(seconds)d seconds") %
                {"cluster": nn.node_group.cluster, "seconds": timeout})
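

# --- Illustrative only: parse_dfs_report() is called above but not defined in
# --- this excerpt. A minimal sketch, assuming the "hadoop dfsadmin -report"
# --- output lists one "Field: value" block per datanode after a line starting
# --- with "Datanodes available", with blank lines separating the blocks.
def parse_dfs_report(cmd_output):
    datanodes = []
    current = {}
    started = False
    for line in cmd_output.splitlines():
        if not started:
            # per-datanode sections follow the "Datanodes available" header
            started = line.startswith("Datanodes available")
            continue
        if not line.strip():
            # a blank line ends the current datanode section
            if current:
                datanodes.append(current)
            current = {}
        elif ':' in line:
            name, _sep, value = line.partition(':')
            current[name.strip()] = value.strip()
    if current:
        datanodes.append(current)
    return datanodes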


def decommission_cluster_instances(self, cluster, clusterspec, instances,
                                   ambari_info):
    request_uri = self._get_command_request_uri(ambari_info, cluster.name)

    hosts_to_decommission = []
    # decommission HDFS datanodes to avoid loss of data
    # during the decommissioning process
    for instance in instances:
        ng_name = instance.node_group.name
        if "DATANODE" in clusterspec.node_groups[ng_name].components:
            # determine the instances that include HDFS support
            hosts_to_decommission.append(instance.fqdn())

    LOG.debug('AmbariClient: hosts_to_decommission = ' +
              str(hosts_to_decommission))

    # template for the request body
    body_header = ('{"RequestInfo" : { "context": "Decommission DataNode",'
                   ' "command" : "DECOMMISSION", "service_name" : "HDFS",'
                   ' "component_name" : "NAMENODE", '
                   ' "parameters" : { "slave_type" : "DATANODE", ')

    excluded_hosts_request = '"excluded_hosts" : "{0}"'

    # generate the comma-separated list of hosts to decommission
    list_of_hosts = ",".join(hosts_to_decommission)
    LOG.debug('AmbariClient: list_of_hosts = ' + list_of_hosts)

    # create the request body
    request_body = (
        body_header +
        excluded_hosts_request.format(list_of_hosts) +
        '}}' +
        ', "Requests/resource_filters":[{"service_name":"HDFS",'
        '"component_name":"NAMENODE"}]}')

    LOG.debug('AmbariClient: about to make decommission request, uri = ' +
              request_uri)
    LOG.debug('AmbariClient: about to make decommission request, '
              'request body = ' + request_body)

    # ask Ambari to decommission the datanodes
    result = self._post(request_uri, ambari_info, request_body)
    if result.status_code != 202:
        LOG.error(
            _LE('AmbariClient: error while making decommission post '
                'request. Error is = %s'), result.text)
        raise ex.DecommissionError(
            _('An error occurred while trying to '
              'decommission the DataNode instances that are '
              'being shut down. '
              'Please consult the Ambari server logs on the '
              'master node for '
              'more information about the failure.'))
    else:
        LOG.info(_LI('AmbariClient: decommission post request succeeded!'))

    status_template = ('http://{0}/api/v1/clusters/{1}/hosts/{2}/'
                       'host_components/{3}')

    # find the host that the NameNode is deployed on
    name_node_host = clusterspec.determine_component_hosts('NAMENODE').pop()
    status_request = status_template.format(
        ambari_info.get_address(), cluster.name,
        name_node_host.fqdn(), 'NAMENODE')

    LOG.debug('AmbariClient: about to make decommission status request, '
              'uri = ' + status_request)

    count = 0
    while count < 100 and len(hosts_to_decommission) > 0:
        LOG.info(
            _LI('AmbariClient: number of hosts waiting for '
                'decommissioning to complete = %s'),
            str(len(hosts_to_decommission)))
        result = self._get(status_request, ambari_info)
        if result.status_code != 200:
            LOG.error(
                _LE('AmbariClient: error in making decommission '
                    'status request, error = %s'), result.text)
        else:
            LOG.info(
                _LI('AmbariClient: decommission status request ok, '
                    'result = %s'), result.text)
            json_result = json.loads(result.text)
            live_nodes = (
                json_result['metrics']['dfs']['namenode']['LiveNodes'])
            # parse out the map of live hosts associated with the NameNode
            json_result_nodes = json.loads(live_nodes)
            for node, val in six.iteritems(json_result_nodes):
                admin_state = val['adminState']
                if admin_state == 'Decommissioned':
                    LOG.info(
                        _LI('AmbariClient: node = %(node)s is '
                            'now in adminState = %(admin_state)s'),
                        {'node': node, 'admin_state': admin_state})
                    # remove from the list, to track which nodes
                    # are now in the Decommissioned state
                    hosts_to_decommission.remove(node)

        LOG.info(_LI('AmbariClient: sleeping for 5 seconds'))
        context.sleep(5)

        # increment the loop counter
        count += 1

    if len(hosts_to_decommission) > 0:
        LOG.error(
            _LE('AmbariClient: decommissioning process timed-out '
                'waiting for nodes to enter "Decommissioned" '
                'status.'))
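

# --- Illustrative only: a sketch of the payload shape the polling loop above
# --- expects from the NAMENODE host_components request. The host name is
# --- invented for this example; note that 'LiveNodes' arrives as a JSON
# --- string and is decoded a second time, as done above.
#
# {
#   "metrics": {
#     "dfs": {
#       "namenode": {
#         "LiveNodes": "{\"dn-1.example.com\": {\"adminState\": \"Decommissioned\"}}"
#       }
#     }
#   }
# }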