class ImpalaCluster(object):

    def __init__(self, cm_host, cm_cluster_name, username, password):
        self.cm_api = ApiResource(cm_host, username=username, password=password)
        self.hosts = dict()
        self.services = list()
        self.cluster = self.cm_api.get_cluster(cm_cluster_name)
        if self.cluster is None:
            raise RuntimeError('Cluster name "%s" not found' % cm_cluster_name)

        self.__load_hosts()
        self.__impala_service = ImpalaService(self)

    def _get_all_services(self):
        return self.cluster.get_all_services()

    def get_impala_service(self):
        return self.__impala_service

    def __load_hosts(self):
        self.hosts = dict()
        # Search for all hosts that are in the target cluster.
        # There is no API that provides the list of hosts in a given cluster, so to
        # find them we must loop through all the hosts and check that the cluster
        # name matches.
        for host_info in self.cm_api.get_all_hosts():
            # host_info doesn't include a link to the roleRef so we need to do
            # another lookup based on the hostId.
            host = self.cm_api.get_host(host_info.hostId)
            # The original had the broken line
            # "for roleRef.get('clusterName') == self.cluster_name:"; iterating
            # over host.roleRefs is the intended loop.
            for roleRef in host.roleRefs:
                if roleRef.get('clusterName') == self.cluster.name:
                    self.hosts[host_info.hostId] = Host(host)
                    break
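# A minimal usage sketch for the class above, assuming the ImpalaService and
# Host helper classes are defined elsewhere in the same module. The CM hostname,
# cluster name and credentials are placeholders.
if __name__ == '__main__':
    cluster = ImpalaCluster('cm-host.example.com', 'Cluster 1',
                            username='admin', password='admin')
    impala = cluster.get_impala_service()
    print 'Loaded %d hosts' % len(cluster.hosts)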
def get_cluster_info(manager_host, server_port=7180, username='******',
                     password='******'):
    cm_api = ApiResource(manager_host, username=username, password=password,
                         server_port=server_port, version=9)
    host = list(cm_api.get_all_hosts())[0]  # all hosts same instance type
    cluster = list(cm_api.get_all_clusters())[0]
    yarn = filter(lambda x: x.type == 'YARN', list(cluster.get_all_services()))[0]
    hive = filter(lambda x: x.type == 'HIVE', list(cluster.get_all_services()))[0]
    impala = filter(lambda x: x.type == 'IMPALA', list(cluster.get_all_services()))[0]
    hive_hs2 = hive.get_roles_by_type('HIVESERVER2')[0]
    hive_host = cm_api.get_host(hive_hs2.hostRef.hostId).hostname
    hive_port = int(hive_hs2.get_config('full')['hs2_thrift_address_port'].default)
    impala_hs2 = impala.get_roles_by_type('IMPALAD')[0]
    impala_host = cm_api.get_host(impala_hs2.hostRef.hostId).hostname
    impala_port = int(impala_hs2.get_config('full')['hs2_port'].default)
    return {'num_worker_nodes': len(yarn.get_roles_by_type('NODEMANAGER')),
            'node_cores': host.numCores,
            'node_memory': host.totalPhysMemBytes,
            'hive_host': hive_host,
            'hive_port': hive_port,
            'impala_host': impala_host,
            'impala_port': impala_port}
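# Hedged usage sketch for get_cluster_info(). The hostname is a placeholder and
# real credentials must replace the masked defaults.
info = get_cluster_info('cm-host.example.com', username='admin', password='admin')
print 'Hive at %s:%d, Impala at %s:%d' % (
    info['hive_host'], info['hive_port'], info['impala_host'], info['impala_port'])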
class cm_utils(object):

    def __init__(self, service, role, host, list_only):
        # 'list_only' was originally named 'list', which shadows the builtin;
        # renamed here for clarity.
        self.service = service.lower()
        self.role = role.lower()
        self.host = host.lower()
        self.list = list_only.lower()
        cm_host = '10.7.177.234'
        self.api = ApiResource(cm_host, username="******", password="******")

    def main(self):
        for c in self.api.get_all_clusters():
            print c
            for s in c.get_all_services():
                print "SERVICE : " + s.displayName + "==============="
                if (self.service in s.displayName.lower()) or (self.service == "all"):
                    s_filter = s
                    for r in s_filter.get_all_roles():
                        if (self.role in r.type.lower()) or (self.role == "all"):
                            h = r.hostRef.hostId
                            hostname, ipAddress, healthSummary = self._get_host_info(h)
                            if (self.host in hostname) or (self.host in ipAddress) \
                                    or (self.host in h) or (self.host == "all"):
                                if self.list == "yes":
                                    print ipAddress
                                else:
                                    print "[" + r.type + "]" + hostname + " " \
                                        + ipAddress + " " + healthSummary

    def _get_host_info(self, hostid):
        host = self.api.get_host(hostid)
        return host.hostname, host.ipAddress, host.healthSummary
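# Hedged usage sketch: print one line per DataNode role of the HDFS service.
# The filter values are illustrative; "all" wildcards any of the four filters,
# and passing "yes" as the last argument prints bare IPs only.
cm_utils('hdfs', 'datanode', 'all', 'no').main()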
def main(cm_fqhn, cm_user_name, cm_user_password, cm_cluster_name,
         cm_tls_enabled, cm_tls_cafile):
    if cm_tls_enabled == 'false':
        api = ApiResource(server_host=cm_fqhn,
                          username=cm_user_name,
                          password=cm_user_password)
    else:
        context = ssl.create_default_context(cafile=cm_tls_cafile)
        api = ApiResource(server_host=cm_fqhn,
                          username=cm_user_name,
                          password=cm_user_password,
                          use_tls=True,
                          ssl_context=context)

    # Find the target cluster among all clusters managed by this CM instance.
    cdh_cluster = None
    for c in api.get_all_clusters():
        if c.name == cm_cluster_name:
            print '\nCluster:', c
            cdh_cluster = c

    for x in cdh_cluster.list_hosts():
        HOST_NAME2ID_MAP[api.get_host(x.hostId).hostname] = x.hostId
        HOST_ID2NAME_MAP[x.hostId] = api.get_host(x.hostId).hostname

    print '\nHostName to HostId Mapping:'
    for x in HOST_NAME2ID_MAP:
        print x, HOST_NAME2ID_MAP[x]
    print '\nHostId to HostName Mapping:'
    for x in HOST_ID2NAME_MAP:
        print x, HOST_ID2NAME_MAP[x]
    print '\nServices:'
    for x in cdh_cluster.get_all_services():
        print x.type

    # ZooKeeper
    zk_service = getServiceByServiceType(cdh_cluster, SERVICE_TYPE_MAP['zookeeper'])
    zk_server_rcg = getRCGByServiceAndRoleType(
        zk_service, SERVICE_ROLE_TYPE_MAP['zookeeper_server'])
    zk_client_port = geValueByKeyInRCG(
        zk_server_rcg, CONFIG_PROPERTY_MAP['zk_client_port'])
    if zk_client_port is not None:
        CONFIG_KEY_VALUE_MAP['ZOOKEEPER_PORT'] = zk_client_port
    zk_hosts = getHostsByServiceAndRoleType(
        zk_service, SERVICE_ROLE_TYPE_MAP['zookeeper_server'])
    if len(zk_hosts) > 0:
        CONFIG_KEY_VALUE_MAP['ZOOKEEPER_QUORUM'] = ' '.join(zk_hosts)

    # HDFS
    hdfs_service = getServiceByServiceType(cdh_cluster, SERVICE_TYPE_MAP['hdfs'])
    hdfs_nn_rcg = getRCGByServiceAndRoleType(
        hdfs_service, SERVICE_ROLE_TYPE_MAP['namenode'])
    hdfs_nn_ns = geValueByKeyInRCG(hdfs_nn_rcg, CONFIG_PROPERTY_MAP['hdf_nn_ns'])
    hdfs_nn_port = geValueByKeyInRCG(hdfs_nn_rcg, CONFIG_PROPERTY_MAP['hdf_nn_port'])
    if hdfs_nn_port is None:
        hdfs_nn_port = CONFIG_KEY_VALUE_MAP['NAME_NODE_PORT']
    else:
        CONFIG_KEY_VALUE_MAP['NAME_NODE_PORT'] = hdfs_nn_port
    nn_hosts = None
    if hdfs_nn_ns is None:
        nn_hosts = getHostsByServiceAndRoleType(
            hdfs_service, SERVICE_ROLE_TYPE_MAP['namenode'])
        CONFIG_KEY_VALUE_MAP['NAME_NODE'] = 'hdfs://' + nn_hosts[0] + ':' + hdfs_nn_port
    else:
        CONFIG_KEY_VALUE_MAP['NAME_NODE'] = hdfs_nn_ns

    # YARN
    yarn_service = getServiceByServiceType(cdh_cluster, SERVICE_TYPE_MAP['yarn'])
    yarn_jt_rcg = getRCGByServiceAndRoleType(
        yarn_service, SERVICE_ROLE_TYPE_MAP['resourcemanager'])
    yarn_rm_address = geValueByKeyInRCG(
        yarn_jt_rcg, CONFIG_PROPERTY_MAP['yarn_rm_address'])
    if yarn_rm_address is None:
        yarn_rm_address = CONFIG_KEY_VALUE_MAP['RESOURCEMANAGER_ADDRESS']
    else:
        CONFIG_KEY_VALUE_MAP['RESOURCEMANAGER_ADDRESS'] = yarn_rm_address
    rm_hosts = getHostsByServiceAndRoleType(
        yarn_service, SERVICE_ROLE_TYPE_MAP['resourcemanager'])
    CONFIG_KEY_VALUE_MAP['JOB_TRACKER'] = rm_hosts[0] + ':' + yarn_rm_address
    # OOZIE
    oozie_service = getServiceByServiceType(cdh_cluster, SERVICE_TYPE_MAP['oozie'])
    oozie_use_ssl = getValueByKeyServiceConfig(
        oozie_service, CONFIG_PROPERTY_MAP['oozie_use_ssl'])
    if oozie_use_ssl == 'true':
        CONFIG_KEY_VALUE_MAP['OOZIE_USE_SSL'] = 'true'
    oozie_LB = getValueByKeyServiceConfig(
        oozie_service, CONFIG_PROPERTY_MAP['oozie_load_balancer'])
    oozie_server_rcg = getRCGByServiceAndRoleType(
        oozie_service, SERVICE_ROLE_TYPE_MAP['oozie_server'])
    oozie_http_port = geValueByKeyInRCG(
        oozie_server_rcg, CONFIG_PROPERTY_MAP['oozie_http_port'])
    oozie_https_port = geValueByKeyInRCG(
        oozie_server_rcg, CONFIG_PROPERTY_MAP['oozie_https_port'])
    if oozie_http_port is None:
        oozie_http_port = CONFIG_KEY_VALUE_MAP['OOZIE_HTTP_PORT']
    if oozie_https_port is None:
        oozie_https_port = CONFIG_KEY_VALUE_MAP['OOZIE_HTTPS_PORT']
    oozie_hosts = getHostsByServiceAndRoleType(
        oozie_service, SERVICE_ROLE_TYPE_MAP['oozie_server'])
    if CONFIG_KEY_VALUE_MAP['OOZIE_USE_SSL'] == 'true':
        if oozie_LB is not None:
            CONFIG_KEY_VALUE_MAP['OOZIE_URL'] = 'https://' + oozie_LB
        else:
            # Use the https scheme here; the original built an 'http://' URL
            # with the HTTPS port, which cannot work when TLS/SSL is enabled.
            CONFIG_KEY_VALUE_MAP['OOZIE_URL'] = ('https://' + oozie_hosts[0] + ':'
                + CONFIG_KEY_VALUE_MAP['OOZIE_HTTPS_PORT'] + '/oozie')
    else:
        if oozie_LB is not None:
            CONFIG_KEY_VALUE_MAP['OOZIE_URL'] = 'http://' + oozie_LB
        else:
            CONFIG_KEY_VALUE_MAP['OOZIE_URL'] = ('http://' + oozie_hosts[0] + ':'
                + CONFIG_KEY_VALUE_MAP['OOZIE_HTTP_PORT'] + '/oozie')

    # HBASE
    hbase_service = getServiceByServiceType(cdh_cluster, SERVICE_TYPE_MAP['hbase'])
    hbase_rs_rcg = getRCGByServiceAndRoleType(
        hbase_service, SERVICE_ROLE_TYPE_MAP['hbase_restserver'])
    hbase_rs_port = geValueByKeyInRCG(
        hbase_rs_rcg, CONFIG_PROPERTY_MAP['hbase_rs_port'])
    if hbase_rs_port is not None:
        CONFIG_KEY_VALUE_MAP['HBASE_REST_PORT'] = hbase_rs_port
    hbase_rs_hosts = getHostsByServiceAndRoleType(
        hbase_service, SERVICE_ROLE_TYPE_MAP['hbase_restserver'])
    CONFIG_KEY_VALUE_MAP['HBASE_REST_IP'] = hbase_rs_hosts[0]

    # KAFKA
    kafka_service = getServiceByServiceType(cdh_cluster, SERVICE_TYPE_MAP['kafka'])
    kafka_broker_rcg = getRCGByServiceAndRoleType(
        kafka_service, SERVICE_ROLE_TYPE_MAP['kafka_broker'])
    kafka_client_security_protocol = geValueByKeyInRCG(
        kafka_broker_rcg, CONFIG_PROPERTY_MAP['kafka_client_security_protocol'])
    if kafka_client_security_protocol is not None:
        CONFIG_KEY_VALUE_MAP['KAFKA_SECURITY_PROTOCOL'] = kafka_client_security_protocol
    kafka_broker_hosts = getHostsByServiceAndRoleType(
        kafka_service, SERVICE_ROLE_TYPE_MAP['kafka_broker'])
    if len(kafka_broker_hosts) > 0:
        # Join the Kafka broker hosts; the original joined zk_hosts here,
        # which looks like a copy/paste slip.
        CONFIG_KEY_VALUE_MAP['KAFKA_BROKER'] = ' '.join(kafka_broker_hosts)

    # Print all
    print '\nOUTPUT:\n', CONFIG_KEY_VALUE_MAP
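# The helper functions used above are not shown in this excerpt. Minimal
# sketches of what they might look like on top of the cm_api object model;
# the names mirror the calls above (including the original's misspelled
# geValueByKeyInRCG), but these bodies are assumptions, not the originals.
def getServiceByServiceType(cluster, service_type):
    # Return the first service of the given type, or None.
    for s in cluster.get_all_services():
        if s.type == service_type:
            return s
    return None

def getRCGByServiceAndRoleType(service, role_type):
    # Return the first role config group with the given role type, or None.
    for rcg in service.get_all_role_config_groups():
        if rcg.roleType == role_type:
            return rcg
    return None

def geValueByKeyInRCG(rcg, key):
    # view='full' yields ApiConfig objects that carry .value (may be None when
    # the property is unset and only a default exists).
    config = rcg.get_config(view='full')
    return config[key].value if key in config else None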
# Assumes 'api' is an ApiResource created earlier in the original script;
# the two host maps and zk_service_type are initialized here so the fragment
# runs standalone (zk_service_type = 'ZOOKEEPER' is the standard CM type name).
host_name2id_map = {}
host_id2name_map = {}
zk_service_type = 'ZOOKEEPER'

hdfs_service = None
hdfs_service_role_list = None
hdfs_host_list = []
yarn_service = None
yarn_service_role_list = None
yarn_host_list = []
oozie_service = None
oozie_service_role_list = None
oozie_host_list = []

for c in api.get_all_clusters():
    # if c.name == cluster_name:
    print c
    cdh_cluster = c

for x in cdh_cluster.list_hosts():
    host_name2id_map[api.get_host(x.hostId).hostname] = x.hostId
    host_id2name_map[x.hostId] = api.get_host(x.hostId).hostname

for x in cdh_cluster.get_all_services():
    print x.type

for x in host_name2id_map:
    print x, host_name2id_map[x]
for x in host_id2name_map:
    print x, host_id2name_map[x]

for s in cdh_cluster.get_all_services():
    if s.type == zk_service_type:
        print 'SERVICE:', s.type, s.get_config()
        zk_service = s

zk_service_role_group_list = zk_service.get_all_role_config_groups()
for x in zk_service_role_group_list:
    print 'ROLE_GROUP:', x.roleType,
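# A hedged continuation sketch: read one property (the ZooKeeper client port)
# out of the role config groups printed above. 'clientPort' is the CM config
# key referenced elsewhere in this collection; the fallback to .default is an
# assumption about how unset properties should be resolved.
for rcg in zk_service.get_all_role_config_groups():
    if rcg.roleType == 'SERVER':
        full_config = rcg.get_config(view='full')
        client_port = full_config['clientPort'].value \
            or full_config['clientPort'].default
        print 'ZooKeeper clientPort:', client_port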
class ClouderaManager(object):
    """The complete orchestration of a cluster from start to finish, assuming all
    the hosts are configured and Cloudera Manager is installed with all the
    required databases set up.

    Handles all the steps required in creating a cluster. All the functions are
    built to run idempotently, so you should be able to resume from any failed
    step by re-running __class__.setup().
    """

    def __init__(self, module, config, trial=False, license_txt=None):
        self.api = ApiResource(config['cm']['host'],
                               username=config['cm']['username'],
                               password=config['cm']['password'])
        self.manager = self.api.get_cloudera_manager()
        self.config = config
        self.module = module
        self.trial = trial
        self.license_txt = license_txt
        self.cluster = None

    def enable_license(self):
        """Enable the requested license: either trial mode is started or a full
        license is entered and registered."""
        try:
            _license = self.manager.get_license()
        except ApiException:
            print_json(type="LICENSE", msg="Enabling license")
            if self.trial:
                self.manager.begin_trial()
            else:
                # The original referenced the bare name 'license_txt' here, which
                # is undefined in this scope; the instance attribute is meant.
                if self.license_txt is not None:
                    self.manager.update_license(self.license_txt)
                else:
                    fail(self.module,
                         'License should be provided or trial should be specified')
            try:
                _license = self.manager.get_license()
            except ApiException:
                fail(self.module, 'Failed enabling license')
        print_json(type="LICENSE",
                   msg="Owner: {}, UUID: {}".format(_license.owner, _license.uuid))

    def create_cluster(self):
        """Create a cluster and add hosts to it. A new cluster is only created
        if another one doesn't already exist with the same name."""
        print_json(type="CLUSTER", msg="Creating cluster")
        cluster_config = self.config['cluster']
        try:
            self.cluster = self.api.get_cluster(cluster_config['name'])
        except ApiException:
            print_json(type="CLUSTER",
                       msg="Creating Cluster entity: {}".format(
                           cluster_config['name']))
            self.cluster = self.api.create_cluster(cluster_config['name'],
                                                   cluster_config['version'],
                                                   cluster_config['fullVersion'])

        cluster_hosts = [self.api.get_host(host.hostId).hostname
                         for host in self.cluster.list_hosts()]
        hosts = []
        for host in cluster_config['hosts']:
            if host not in cluster_hosts:
                hosts.append(host)
        self.cluster.add_hosts(hosts)

    def activate_parcels(self):
        print_json(type="PARCELS", msg="Setting up parcels")
        for parcel_cfg in self.config['parcels']:
            parcel = Parcels(self.module, self.manager, self.cluster,
                             parcel_cfg.get('version'), parcel_cfg.get('repo'),
                             parcel_cfg.get('product', 'CDH'))
            parcel.download()
            parcel.distribute()
            parcel.activate()

    @retry(attempts=20, delay=5)
    def wait_inspect_hosts(self, cmd):
        """Inspect all the hosts. Basically wait until the check completes on
        all hosts.

        :param cmd: A command instance used for tracking the status of the command
        """
        print_json(type="HOSTS", msg="Inspecting hosts")
        cmd = cmd.fetch()
        if cmd.success is None:
            raise ApiException("Waiting on command {} to finish".format(cmd))
        elif not cmd.success:
            if (cmd.resultMessage is not None and
                    'is not currently available for execution' in cmd.resultMessage):
                raise ApiException('Retry Command')
            fail(self.module, 'Host inspection failed')
        print_json(type="HOSTS",
                   msg="Host inspection completed: {}".format(cmd.resultMessage))

    def deploy_mgmt_services(self):
        """Configure, deploy and start all the Cloudera Management Services."""
        print_json(type="MGMT", msg="Deploying Management Services")
        try:
            mgmt = self.manager.get_service()
            if mgmt.serviceState == 'STARTED':
                return
        except ApiException:
            print_json(type="MGMT", msg="Management Services don't exist. Creating.")
Creating.") mgmt = self.manager.create_mgmt_service(ApiServiceSetupInfo()) for role in config['services']['MGMT']['roles']: if not len(mgmt.get_roles_by_type(role['group'])) > 0: print_json(type="MGMT", msg="Creating role for {}".format(role['group'])) mgmt.create_role('{}-1'.format(role['group']), role['group'], role['hosts'][0]) for role in config['services']['MGMT']['roles']: role_group = mgmt.get_role_config_group('mgmt-{}-BASE'.format( role['group'])) role_group.update_config(role.get('config', {})) mgmt.start().wait() if self.manager.get_service().serviceState == 'STARTED': print_json(type="MGMT", msg="Management Services started") else: fail( self.module, "[MGMT] Cloudera Management services didn't start up properly") def service_orchestrate(self, services): """ Create, pre-configure provided list of services Stop/Start those services Perform and post service startup actions :param services: List of Services to perform service specific actions """ service_classes = [] # Create and pre-configure provided services for service in services: service_config = self.config['services'].get(service.upper()) if service_config: svc = getattr(sys.modules[__name__], service)(self.cluster, service_config) if not svc.started: svc.deploy() svc.pre_start() service_classes.append(svc) print_json(type="CLUSTER", msg="Starting services: {} on Cluster".format(services)) # Deploy all the client configs, since some of the services depend on other services # and is essential that the client configs are in place self.cluster.deploy_client_config() # Start each service and run the post_start actions for each service for svc in service_classes: # Only go thru the steps if the service is not yet started. This helps with # re-running the script after fixing errors if not svc.started: svc.start() svc.post_start() def setup(self): # TODO(rnirmal): Cloudera Manager SSL? # Enable a full license or start a trial self.enable_license() # Create the cluster entity and associate hosts self.create_cluster() # Download and activate the parcels self.activate_parcels() # Inspect all the hosts self.wait_inspect_hosts(self.manager.inspect_hosts()) # Create Management services self.deploy_mgmt_services() # Configure and Start base services self.service_orchestrate(BASE_SERVICES) # Configure and Start remaining services self.service_orchestrate(ADDITIONAL_SERVICES)
# Assumed imports for this module (the original excerpt omits its header):
#   from ansible.module_utils.basic import AnsibleModule
#   from cm_api.api_client import ApiResource, ApiException
#   from cm_api.endpoints.services import ApiServiceSetupInfo
#   from collections import defaultdict
#   from inspect import ismethod
#   from json import loads
#   from re import match
#   from subprocess import Popen, PIPE
#   from time import sleep
#   import re
def main():
    module = AnsibleModule(argument_spec=dict((argument, {'type': 'str'})
                                              for argument in MODULE_ARGUMENTS))

    api = ApiResource('localhost', username=ADMIN_USER, password=ADMIN_PASS,
                      version=9)
    cluster_name = CLUSTER_NAME

    manager = api.get_cloudera_manager()

    action_a = module.params.get('action', None)

    if action_a == 'create_cluster':
        license_a = module.params.get('license', None)
        version_a = module.params.get('version', None)

        cluster_list = [x.name for x in api.get_all_clusters()]
        if cluster_name in cluster_list:
            module.exit_json(changed=False, msg='Cluster exists')
        else:
            cluster = api.create_cluster(CLUSTER_NAME, fullVersion=version_a)
            if license_a is None:
                manager.begin_trial()
            else:
                manager.update_license(license_a.decode('base64'))
            module.exit_json(changed=True, msg='Cluster created')
    elif action_a in ['add_host', 'create_mgmt', 'deploy_parcel',
                      'deploy_hdfs_base', 'deploy_hdfs_httpfs', 'deploy_hdfs_dn',
                      'deploy_hdfs_ha', 'deploy_rm_ha', 'set_config', 'service',
                      'deploy_service', 'deploy_service_worker_nodes',
                      'deploy_base_roles', 'run_command', 'cluster',
                      'create_snapshot_policy', 'deploy_configuration']:
        # More complicated actions that need a created cluster go here.
        cluster = api.get_cluster(cluster_name)
        host_map = dict((api.get_host(x.hostId).hostname, x.hostId)
                        for x in cluster.list_hosts())

        # Adds a host to the cluster. host_name should be in the internal DNS
        # format, e.g. ip-xx-xx-xx.compute.internal.
        if action_a == 'add_host':
            host_a = module.params.get('host', None)

            host_list = host_map.keys()
            if host_a in host_list:
                module.exit_json(changed=False, msg='Host already in cluster')
            else:
                try:
                    cluster.add_hosts([host_a])
                except ApiException:
                    # If a host isn't there, it could be because the agent
                    # didn't manage to connect yet, so let's wait a moment for it.
                    sleep(120)
                    cluster.add_hosts([host_a])
                module.exit_json(changed=True, msg='Host added')

        # Create the management service and set its basic configuration. This
        # needs a separate branch since management is handled differently than
        # the rest of the services.
        elif action_a == 'create_mgmt':
            host_a = module.params.get('host', None)

            # Getting the management service is the only way to check if mgmt
            # exists; an exception means there isn't one.
            try:
                mgmt = manager.get_service()
                module.exit_json(changed=False, msg='Mgmt service already exists')
            except ApiException:
                pass

            mgmt = manager.create_mgmt_service(ApiServiceSetupInfo())

            # this is ugly... and I see no good way to unuglify it
            firehose_passwd = Popen(
                "sudo grep com.cloudera.cmf.ACTIVITYMONITOR.db.password "
                "/etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'",
                shell=True, stdout=PIPE).stdout.read().rstrip("\n")
            reports_passwd = Popen(
                "sudo grep com.cloudera.cmf.REPORTSMANAGER.db.password "
                "/etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'",
                shell=True, stdout=PIPE).stdout.read().rstrip("\n")

            # since there is no easy way of configuring the manager... let's do
            # it here :(
            role_conf = defaultdict(dict)
            role_conf['ACTIVITYMONITOR'] = {
                'firehose_database_host': '{0}:7432'.format(host_a),
                'firehose_database_user': '******',
                'firehose_database_password': firehose_passwd,
                'firehose_database_type': 'postgresql',
                'firehose_database_name': 'amon',
                'firehose_heapsize': '268435456',
            }
            role_conf['EVENTSERVER'] = {
                'event_server_heapsize': '215964392'
            }
            role_conf['REPORTSMANAGER'] = {
                'headlamp_database_host': '{0}:7432'.format(host_a),
                'headlamp_database_user': '******',
                'headlamp_database_password': reports_passwd,
                'headlamp_database_type': 'postgresql',
                'headlamp_database_name': 'rman',
                'headlamp_heapsize': '268435456',
            }

            roles = ['ACTIVITYMONITOR', 'ALERTPUBLISHER', 'EVENTSERVER',
                     'HOSTMONITOR', 'SERVICEMONITOR', 'REPORTSMANAGER']
            # Create management roles
            for role in roles:
                mgmt.create_role('{0}-1'.format(role), role, host_map[host_a])

            # Update the configuration of each role config group
            for group in mgmt.get_all_role_config_groups():
                group.update_config(role_conf[group.roleType])

            mgmt.start().wait()
            # After starting, this service needs time to spin up.
            sleep(30)
            module.exit_json(changed=True, msg='Mgmt created and started')

        # Deploy a given parcel on all hosts in the cluster. You can specify a
        # substring of the version ending with "latest", for example 5.3-latest
        # instead of 5.3.5-1.cdh5.3.5.p0.4.
        elif action_a == 'deploy_parcel':
            name_a = module.params.get('name', None)
            version_a = module.params.get('version', None)

            if "latest" in version_a:
                available_versions = [x.version for x in cluster.get_all_parcels()
                                      if x.product == name_a]
                if "-latest" in version_a:
                    version_substr = match('(.+?)-latest', version_a).group(1)
                # if version is just "latest", try to check everything
                else:
                    version_substr = ".*"
                try:
                    [version_parcel] = [x for x in available_versions
                                        if re.match(version_substr, x) is not None]
                except ValueError:
                    module.fail_json(
                        msg='Specified version {0} doesnt appear in {1} '
                            'or appears twice'.format(version_substr,
                                                      available_versions))
            else:
                version_parcel = version_a

            # We now go through the various stages of getting the parcel. As
            # there is no built-in way of waiting for an operation to complete,
            # we use loops with sleep to get it done.
            parcel = cluster.get_parcel(name_a, version_parcel)
            if parcel.stage == 'AVAILABLE_REMOTELY':
                parcel.start_download()
                while parcel.stage != 'DOWNLOADED':
                    parcel = cluster.get_parcel(name_a, version_parcel)
                    if parcel.state.errors:
                        raise Exception(str(parcel.state.errors))
                    sleep(10)

            if parcel.stage == 'DOWNLOADED':
                parcel.start_distribution()
                while parcel.stage != 'DISTRIBUTED':
                    parcel = cluster.get_parcel(name_a, version_parcel)
                    if parcel.state.errors:
                        raise Exception(str(parcel.state.errors))
                    # Sleep while hosts report problems after the download
                    for i in range(12):
                        sleep(10)
                        if sum([1 for x in api.get_all_hosts(view='Full')
                                if x.healthSummary != 'GOOD']) == 0:
                            break

            # Since parcels are distributed automatically when a new host is
            # added to a cluster, we can encounter the "ACTIVATING" stage here.
            if parcel.stage == 'DISTRIBUTED' or parcel.stage == 'ACTIVATING':
                if parcel.stage == 'DISTRIBUTED':
                    parcel.activate()
                while parcel.stage != 'ACTIVATED':
                    parcel = cluster.get_parcel(name_a, version_parcel)
                    # This sleep has to be large because although the operation
                    # is very fast, it makes the management and cloudera hosts
                    # go bonkers, failing all of the health checks.
                    sleep(10)
                # Sleep while hosts report problems after the distribution
                for i in range(60):
                    sleep(10)
                    if sum([1 for x in api.get_all_hosts(view='Full')
                            if x.healthSummary != 'GOOD']) == 0:
                        break
                module.exit_json(changed=True, msg='Parcel activated')

            if parcel.stage == 'ACTIVATED':
                module.exit_json(changed=False, msg='Parcel already activated')

            # If we get down here, something is not right.
            module.fail_json(msg='Invalid parcel state')

        # Deploy worker nodes, according to SERVICE_WORKER_MAP. Also give them
        # sane names, and initialize the zookeeper and kafka ones, which need
        # ids specified.
        elif action_a == 'deploy_service_worker_nodes':
            host_a = module.params.get('host', None)
            service_a = module.params.get('service', None)
            service_name = SERVICE_MAP[service_a]
            role_name = SERVICE_WORKER_MAP[service_a]['name']
            full_role_name = SERVICE_WORKER_MAP[service_a]['formatstring']

            if not service_name in [x.name for x in cluster.get_all_services()]:
                service = cluster.create_service(service_name, service_name)
            else:
                service = cluster.get_service(service_name)
            nodes = [x for x in service.get_all_roles() if role_name in x.name]

            # If the host already has the given role, we should skip it.
            if host_map[host_a] in [x.hostRef.hostId for x in nodes]:
                module.exit_json(changed=False,
                                 msg='Host already is a {0}'.format(role_name))
            # Otherwise, find out the highest id that currently exists.
            else:
                node_names = [x.name for x in nodes]
                if len(node_names) == 0:
                    # If there are no nodes, start numbering from 1.
                    node_i = 1
                else:
                    # Take the max number and add 1 to it.
                    node_i = max([int(x.split('-')[-1]) for x in node_names]) + 1

                if service_name == 'ZOOKEEPER':
                    role = service.create_role(full_role_name.format(node_i),
                                               'SERVER', host_a)
                    # ZooKeeper needs a per-node ID in the configuration, so we
                    # set it now.
                    role.update_config({'serverId': node_i})
                elif service_name == 'KAFKA':
                    role = service.create_role(full_role_name.format(node_i),
                                               role_name, host_a)
                    # Kafka needs a per-node ID in the configuration, so we set
                    # it now.
                    role.update_config({'broker.id': node_i})
                else:
                    service.create_role(full_role_name.format(node_i), role_name,
                                        host_a)

                module.exit_json(changed=True,
                                 msg='Added host to {0} role'.format(role_name))

        # Deploy a service. Just create it, don't do anything more. This is
        # needed mainly when we have to set service properties before role
        # deployment.
        elif action_a == 'deploy_service':
            name_a = module.params.get('name', None)
            if not name_a in SERVICE_MAP:
                module.fail_json(msg='Unknown service: {0}'.format(name_a))
            service_name = SERVICE_MAP[name_a]
            if not service_name in [x.name for x in cluster.get_all_services()]:
                service = cluster.create_service(service_name, service_name)
                module.exit_json(changed=True,
                                 msg='{0} service created'.format(service_name))
            else:
                module.exit_json(changed=False,
                                 msg='{0} service already exists'.format(
                                     service_name))

        # Deploy the base hdfs roles (the namenode and secondary). This doesn't
        # create the service, as at least one datanode should already be added!
        # The format command also requires certain properties to be set before
        # we run it.
        elif action_a == 'deploy_hdfs_base':
            nn_host_a = module.params.get('nn_host', None)
            sn_host_a = module.params.get('sn_host', None)

            changed = False

            hdfs = cluster.get_service('HDFS')
            hdfs_roles = [x.name for x in hdfs.get_all_roles()]

            # Don't create a secondary namenode when:
            # - there is one that already exists
            # - there is a second namenode, which means we have HA and don't
            #   need a secondary
            if not 'HDFS-SECONDARYNAMENODE' in hdfs_roles \
                    and not 'HDFS-NAMENODE-2' in hdfs_roles:
                hdfs.create_role('HDFS-SECONDARYNAMENODE', 'SECONDARYNAMENODE',
                                 sn_host_a)
                changed = True

            # Create a namenode and format its FS. Formatting the namenode
            # requires at least one datanode and secondary namenode already in
            # the cluster!
            if not 'HDFS-NAMENODE' in hdfs_roles:
                hdfs.create_role('HDFS-NAMENODE', 'NAMENODE', nn_host_a)
                for command in hdfs.format_hdfs('HDFS-NAMENODE'):
                    if command.wait().success == False:
                        module.fail_json(
                            msg='Failed formatting HDFS namenode with '
                                'error: {0}'.format(command.resultMessage))
                changed = True

            module.exit_json(changed=changed, msg='Created HDFS service & NN roles')

        # Enable HttpFS for HDFS; HUE requires this to support HA in HDFS.
        elif action_a == 'deploy_hdfs_httpfs':
            host_a = module.params.get('host', None)
            hdfs = cluster.get_service('HDFS')
            hdfs_roles = [x.name for x in hdfs.get_all_roles()]

            # Don't install a second instance of HttpFS.
            if len([role for role in hdfs_roles if 'HDFS-HTTPFS' in role]) != 0:
                module.exit_json(changed=False,
                                 msg='HDFS HttpFS service already exists')

            hdfs.create_role('HDFS-HTTPFS-1', 'HTTPFS', host_map[host_a])
            module.exit_json(changed=True, msg='HDFS HttpFS service created')

        # Enable HA for HDFS. This deletes the secondary namenode and creates a
        # second namenode in its place. It also spawns 3 journalnode and 2
        # failover controller roles.
        elif action_a == 'deploy_hdfs_ha':
            sn_host_a = module.params.get('sn_host', None)
            jn_dir_a = module.params.get('jn_dir', None)
            jn_names_a = [module.params.get('jn1_host', None),
                          module.params.get('jn2_host', None),
                          module.params.get('jn3_host', None)]

            hdfs = cluster.get_service('HDFS')

            # If there's a second namenode, this means we already have HA enabled.
            if not 'HDFS-NAMENODE-2' in [x.name for x in hdfs.get_all_roles()]:
                # this is bad and I should feel bad
                # jns is a list of dictionaries, each dict passing the required
                # journalnode parameters.
                jns = [{'jnHostId': host_map[jn_name],
                        'jnEditsDir': jn_dir_a,
                        'jnName': 'HDFS-JOURNALNODE-{0}'.format(i + 1)}
                       for i, jn_name in enumerate(jn_names_a)]

                # This call is so long because we set some predictable names
                # for the services.
                command = hdfs.enable_nn_ha(
                    'HDFS-NAMENODE', host_map[sn_host_a], 'nameservice1', jns,
                    zk_service_name='ZOOKEEPER',
                    active_fc_name='HDFS-FAILOVERCONTROLLER-1',
                    standby_fc_name='HDFS-FAILOVERCONTROLLER-2',
                    standby_name='HDFS-NAMENODE-2')

                children = command.wait().children
                for command_children in children:
                    # The Format command is expected to fail, since we already
                    # formatted the namenode. Note: the original tested the
                    # parent 'command.success' here; the child command's status
                    # is what we mean to check.
                    if command_children.name != 'Format' \
                            and command_children.success == False:
                        module.fail_json(
                            msg='Command {0} failed when enabling HDFS HA with '
                                'error {1}'.format(command_children.name,
                                                   command_children.resultMessage))
                module.exit_json(changed=True, msg='Enabled HA for HDFS service')
            else:
                module.exit_json(changed=False, msg='HDFS HA already enabled')

        # Enable HA for YARN.
        elif action_a == 'deploy_rm_ha':
            sn_host_a = module.params.get('sn_host', None)

            yarn = cluster.get_service('YARN')

            # If there are two roles matching this name, HA for YARN is already
            # enabled.
            if len([0 for x in yarn.get_all_roles()
                    if match('^YARN-RESOURCEMANAGER.*$', x.name) is not None]) == 1:
                command = yarn.enable_rm_ha(sn_host_a, zk_service_name='ZOOKEEPER')
                children = command.wait().children
                for command_children in children:
                    # As in the HDFS HA branch, check the child command's status
                    # (the original tested the parent 'command.success').
                    if command_children.success == False:
                        module.fail_json(
                            msg='Command {0} failed when enabling YARN HA with '
                                'error {1}'.format(command_children.name,
                                                   command_children.resultMessage))
                module.exit_json(changed=True, msg='Enabled HA for YARN service')
            else:
                module.exit_json(changed=False, msg='YARN HA already enabled')

        # Deploy the base roles for a service, according to
        # BASE_SERVICE_ROLE_MAP. After the deployments, run the commands
        # specified in SERVICE_INIT_COMMANDS.
        elif action_a == 'deploy_base_roles':
            host_a = module.params.get('host', None)
            service_a = module.params.get('service', None)
            service_name = SERVICE_MAP[service_a]

            changed = False

            if not service_name in [x.name for x in cluster.get_all_services()]:
                service = cluster.create_service(service_name, service_name)
            else:
                service = cluster.get_service(service_name)
            service_roles = [x.name for x in service.get_all_roles()]

            # Create each role from the map.
            for (role_name, cloudera_name) in \
                    BASE_SERVICE_ROLE_MAP[service_a].items():
                # Check whether the role already exists; the script can't
                # compare names directly because after enabling HA on YARN,
                # roles will have random strings in their names.
                if len([0 for x in service_roles
                        if match(role_name, x) is not None]) == 0:
                    service.create_role(role_name, cloudera_name, host_a)
                    changed = True

                    # Init commands
                    if role_name in SERVICE_INIT_COMMANDS.keys():
                        for command_to_run in SERVICE_INIT_COMMANDS[role_name]:
                            # Different handling of commands specified by name
                            # and ones specified by an instance method.
                            if ismethod(command_to_run):
                                command = command_to_run(service)
                            else:
                                command = service.service_command_by_name(
                                    command_to_run)
                            if command.wait().success == False:
                                module.fail_json(
                                    msg='Running {0} failed with {1}'.format(
                                        command_to_run, command.resultMessage))

            if changed == True:
                module.exit_json(changed=True,
                                 msg='Created base roles for {0}'.format(
                                     service_name))
            else:
                module.exit_json(changed=False,
                                 msg='{0} base roles already exist'.format(
                                     service_name))

        # Deploy configuration - it always returns changed.
        elif action_a == 'deploy_configuration':
            service_a = module.params.get('service', None)
            service_name = SERVICE_MAP[service_a]
            service = cluster.get_service(service_name)

            # Deploy the client configuration.
            command = service.deploy_client_config()
            if command.wait().success == False:
                module.fail_json(msg='Deploying client config failed '
                                     'with {0}'.format(command.resultMessage))
            module.exit_json(changed=True, msg='Configuration deployed')

        # Set config values for a given service/role.
        elif action_a == 'set_config':
            entity_a = module.params.get('entity', None)
            service_a = module.params.get('service', None)
            role_a = module.params.get('role', None)
            name_a = module.params.get('name', None)
            value_a = module.params.get('value', None)

            if not service_a in SERVICE_MAP:
                module.fail_json(msg='Unknown service: {0}'.format(service_a))

            # Since management is handled differently, it needs a different
            # service object.
            if service_a == 'management':
                service = manager.get_service()
            elif service_a == 'cm':
                service = manager
            else:
                service = cluster.get_service(SERVICE_MAP[service_a])

            # Role and service configs are handled differently.
            if entity_a == 'service':
                prev_config = service.get_config()
                curr_config = service.update_config({name_a: value_a})
                if service_a == 'cm':
                    prev_config = [prev_config]
                    curr_config = [curr_config]
                module.exit_json(
                    changed=(str(prev_config[0]) != str(curr_config[0])),
                    msg='Config value for {0}: {1}'.format(
                        name_a, curr_config[0][name_a]))
            elif entity_a == 'role':
                if not role_a in ROLE_MAP:
                    module.fail_json(msg='Unknown role: {0}'.format(service))

                role = service.get_role_config_group(ROLE_MAP[role_a])
                prev_config = role.get_config()
                curr_config = role.update_config({name_a: value_a})
                module.exit_json(
                    changed=(str(prev_config) != str(curr_config)),
                    msg='Config value for {0}: {1}'.format(name_a,
                                                           curr_config[name_a]))
            else:
                module.fail_json(msg='Invalid entity, must be one of service, role')

        # Handle service state. Currently this can only start/restart a service.
        elif action_a == 'service':
            state_a = module.params.get('state', None)
            service_a = module.params.get('service', None)

            try:
                if service_a == 'cm':
                    service = manager.get_service()
                else:
                    service = cluster.get_service(SERVICE_MAP[service_a])
            except ApiException:
                module.fail_json(msg='Service does not exist')

            # When starting a service, we also deploy the client config for it.
            if state_a == 'started':
                if service.serviceState == 'STARTED':
                    module.exit_json(changed=False, msg='Service already running')
                method = service.start
                verb = "start"
            elif state_a == 'restarted':
                method = service.restart
                verb = "restart"

            try:
                command = service.deploy_client_config()
                if command.wait().success == False:
                    module.fail_json(msg='Deploying client config failed '
                                         'with {0}'.format(command.resultMessage))
            # Since there is no way to check whether a service handles client
            # config deployments, we try our best and pass on the exception if
            # it doesn't. Note: the original wrote
            # "except ApiException, AttributeError:", which in Python 2 binds
            # the ApiException instance to the name AttributeError instead of
            # catching both; a tuple is what is intended.
            except (ApiException, AttributeError):
                pass

            method().wait()

            # We need to wait for the cloudera health checks to complete,
            # otherwise the service will report as failing.
            sleep(10)
            for i in range(24):
                sleep(10)
                service = manager.get_service() if service_a == "cm" \
                    else cluster.get_service(SERVICE_MAP[service_a])
                if service.serviceState == 'STARTED' \
                        and service.healthSummary == 'GOOD':
                    break

            service = manager.get_service() if service_a == "cm" \
                else cluster.get_service(SERVICE_MAP[service_a])
            if service.serviceState == 'STARTED' \
                    and service.healthSummary == 'GOOD':
                module.exit_json(changed=True,
                                 msg='Service {0} successful'.format(verb))
            else:
                module.fail_json(msg='Service {0} failed'.format(verb))

        # Handle the cluster. Currently this can only restart.
        elif action_a == 'cluster':
            state_a = module.params.get('state', None)

            if state_a == 'restarted':
                command = cluster.restart(redeploy_client_configuration=True)
                if command.wait().success == False:
                    module.fail_json(msg='Cluster restart failed '
                                         'with {0}'.format(command.resultMessage))
                else:
                    module.exit_json(changed=True, msg='Cluster restart successful')

        # Snapshot policy. Only create is supported.
        elif action_a == 'create_snapshot_policy':
            name_a = module.params.get('name', None)
            value_a = module.params.get('value', None)
            service_a = module.params.get('service', None)
            service = cluster.get_service(SERVICE_MAP[service_a])
            payload = loads(value_a)
            # Check whether the policy already exists; an exception is expected
            # when configuring for the first time.
            try:
                test = service.get_snapshot_policy(name_a)
                module.exit_json(changed=False, msg='Defined policy already exists')
            except ApiException:
                pass
            try:
                command = service.create_snapshot_policy(payload)
                module.exit_json(changed=True, msg='Snapshot policy was created.')
            except (ApiException, AttributeError):
                module.fail_json(msg='ERROR in creating snapshot policy.')
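# The module above references several mapping constants defined elsewhere in
# the original file. Illustrative (assumed) shapes, inferred only from how the
# code above indexes them, so the logic can be read without the full source:
SERVICE_MAP = {'hdfs': 'HDFS', 'yarn': 'YARN', 'zookeeper': 'ZOOKEEPER',
               'kafka': 'KAFKA'}
SERVICE_WORKER_MAP = {
    # 'name' is the substring/role type used to find worker roles;
    # 'formatstring' generates predictable role names, numbered per node.
    'hdfs': {'name': 'DATANODE', 'formatstring': 'HDFS-DATANODE-{0}'},
    'zookeeper': {'name': 'SERVER', 'formatstring': 'ZOOKEEPER-SERVER-{0}'},
}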
cm_host = "localhost" api = ApiResource(cm_host, username="******", password="******") print "*** CLUSTERS ***" clusters = None # List clusters for c in api.get_all_clusters(): print "Cluster \"%s\" is version %s" % (c.name, c.version) clusters = c print "*** HOSTS ***" for host_ref in c.list_hosts(): host = api.get_host(host_ref.hostId) print host.hostname print "*** SERVICES ***" hdfs = None # List services & health info for s in clusters.get_all_services(): print "Service \"%s\" -- state \"%s\" -- health \"%s\"" % ( s.name, s.serviceState, s.healthSummary) # Get HDFS service if 'hdfs' in s.type.lower(): hdfs = s print "*** HDFS Service checks (" + hdfs.serviceUrl + ") ***"
# Assumed imports for this script (the original excerpt omits its header):
#   import fnmatch, os, shutil, sh
#   from sh import ssh
#   from tempfile import mkdtemp
#   from urllib import quote as urlquote
#   from cm_api.api_client import ApiResource
class RemoteDataLoad(object):
    """This is an implementation of the process to load a test-warehouse snapshot
    on a remote CM managed cluster. This script assumes that the warehouse
    snapshot was already downloaded and was either passed in as a parameter, or
    can be found by either inspecting the SNAPSHOT_DIR environment variable, or
    based on the WORKSPACE environment variable on a Jenkins build slave.

    The reason for the additional setup code is that in the local development
    environment it is assumed that $USER is the HDFS superuser, which is not the
    case for remote deployments.
    """

    def __init__(self, cm_host, options):
        logger.info("Starting remote data load...")
        self.options = options
        self.cm_host = cm_host

        # The gateway host can be used if the CM host is not configured as a
        # Hadoop gateway.
        self.gateway = options.gateway if options.gateway else cm_host
        self.impala_home = os.environ["IMPALA_HOME"]
        self.api = ApiResource(self.cm_host, username=options.cm_user,
                               password=options.cm_pass)

        # The API returns a list of clusters managed by the CM host. We're
        # assuming that this CM host was set up for the purpose of Impala
        # testing on one cluster, so the list should only have one value.
        self.cluster = self.api.get_all_clusters()[0]
        self.services = self.get_services()
        self.config = self.get_service_client_configurations()
        logger.info("Retrieved service configuration")
        logger.info(str(self.config))
        self.prepare()
        logger.info("IMPALA_HOME: {0}".format(self.impala_home))

    def get_hostname_for_ref(self, host_ref):
        """Translate the HostRef instance into the hostname."""
        return self.api.get_host(host_ref.hostId).hostname

    @staticmethod
    def get_or_default(config):
        return config.value if config.value else config.default

    def get_services(self):
        """Confirm that all services are running, and return the service dict."""
        services = dict((s.type, s) for s in self.cluster.get_all_services())

        if set(REQUIRED_SERVICES) != set(services.keys()):
            missing_services = set(REQUIRED_SERVICES) - set(services.keys())
            logger.error("Services not installed: {0}".format(
                list(missing_services)))
            raise RuntimeError("Cluster not ready.")

        if not all(services[s].serviceState == 'STARTED' for s in services):
            stopped = [s for s in services
                       if services[s].serviceState != "STARTED"]
            logger.error("Not all services started: {0}".format(stopped))
            raise RuntimeError("Cluster not ready.")

        return services

    @timing
    def download_client_config(self, cluster, service):
        """Download the client configuration zip for a particular cluster and
        service.

        Since cm_api does not provide a way to download the archive, we build
        the URL manually and download the file. Once the file is downloaded,
        the archive is extracted and its content is copied to the Hadoop
        configuration directories defined by Impala.
        """
""" logger.info("Downloading client configuration for {0}".format(service.name)) url = "http://{0}:7180/api/{1}/clusters/{2}/services/{3}/clientConfig".format( self.cm_host, CM_API_VERSION, urlquote(cluster.name), urlquote(service.name)) path = mkdtemp() sh.curl(url, o=os.path.join(path, "clientConfig.zip"), _out=tee, _err=tee) current = os.getcwd() os.chdir(path) sh.unzip("clientConfig.zip") for root, _, file_names in os.walk("."): for filename in fnmatch.filter(file_names, "*.xml"): src = os.path.join(root, filename) dst = os.path.join(self.impala_home, "fe", "src", "test", "resources") logger.debug("Copying {0} to {1}".format(src, dst)) shutil.copy(src, dst) os.chdir(current) # TODO: this may be available in tests/comparison/cluster.py def set_hive_warehouse_dir(self, cluster, service): logger.info("Setting the Hive Warehouse Dir") for service in self.api.get_all_clusters()[0].get_all_services(): logger.info(service) if service.type == "HIVE": hive_config = { "hive_warehouse_directory" : HIVE_WAREHOUSE_DIR } service.update_config(hive_config) # TODO: This functionality should be more generally available to other infrastructure # code, rather than being quarantined in this script. See IMPALA-4367. @timing def get_service_client_configurations(self): """Download the client configurations necessary to upload data to the remote cluster. Unfortunately, the CM API does not allow downloading it so we have to iterate over the services and download the config for all of them. In addition, returns an options dictionary with settings required for data loading like the HS2 server, Impala hosts, Name node etc. Returns: A client-configuration dictionary, e.g.: { 'hive_warehouse_directory': '/test-warehouse', 'hs2': 'impala-test-cluster-1.gce.cloudera.com:10000', 'impalad': ['impala-test-cluster-4.gce.cloudera.com:21000', 'impala-test-cluster-2.gce.cloudera.com:21000', 'impala-test-cluster-3.gce.cloudera.com:21000'], 'metastore': 'impala-test-cluster-1.gce.cloudera.com:9083', 'namenode': 'impala-test-cluster-1.gce.cloudera.com', 'namenode_http': 'impala-test-cluster-1.gce.cloudera.com:20101', 'kudu_master': 'impala-test-cluster-1.gce.cloudera.com' } """ # Iterate overs services and find the information we need result = {} for service_type, service in self.services.iteritems(): if service_type == "IMPALA": roles = service.get_roles_by_type("IMPALAD") impalads = [] for r in roles: rc_config = r.get_config("full") hostname = self.get_hostname_for_ref(r.hostRef) hs2_port = self.get_or_default(rc_config["beeswax_port"]) impalads.append("{0}:{1}".format(hostname, hs2_port)) result["impalad"] = impalads elif service_type == "HBASE": self.download_client_config(self.cluster, service) elif service_type == "HDFS": self.download_client_config(self.cluster, service) role = service.get_roles_by_type("NAMENODE") config = role[0].get_config("full") namenode = self.get_hostname_for_ref(role[0].hostRef) result["namenode"] = namenode result["namenode_http"] = "{0}:{1}".format( namenode, self.get_or_default(config["dfs_http_port"]) ) elif service_type == "HIVE": self.set_hive_warehouse_dir(self.cluster, service) self.download_client_config(self.cluster, service) hs2 = service.get_roles_by_type("HIVESERVER2")[0] rc_config = hs2.get_config("full") result["hive_warehouse_directory"] = self.get_or_default( service.get_config("full")[0]["hive_warehouse_directory"]) hostname = self.get_hostname_for_ref(hs2.hostRef) result["hs2"] = "{0}:{1}".format(hostname, self.get_or_default( rc_config["hs2_thrift_address_port"])) # Get 
                ms = service.get_roles_by_type("HIVEMETASTORE")[0]
                rc_config = ms.get_config("full")
                result["metastore"] = "{0}:{1}".format(
                    self.get_hostname_for_ref(ms.hostRef),
                    self.get_or_default(rc_config["hive_metastore_port"]))
            elif service_type == "KUDU":
                # The KUDU service does not require a client configuration.
                result["kudu_master"] = self.cm_host

        return result

    # TODO: This functionality should be more generally available to other
    # infrastructure code, rather than being quarantined in this script.
    # See IMPALA-4367.
    @staticmethod
    def find_snapshot_file(snapshot_dir):
        """Given snapshot_dir, walk the directory tree until a file matching
        the test-warehouse archive pattern is found."""
        for root, _, file_names in os.walk(snapshot_dir):
            for filename in fnmatch.filter(file_names,
                                           "test-warehouse-*-SNAPSHOT.tar.gz"):
                logger.info("Found Snapshot file {0}".format(filename))
                return os.path.join(root, filename)

    @timing
    def prepare(self):
        """Populate the environment of the process with the necessary values.

        In addition, create helper objects to run shell and SSH processes.
        """
        # Populate the environment with required variables
        os.environ["HS2_HOST_PORT"] = self.config["hs2"]
        os.environ["HDFS_NN"] = self.config["namenode"]
        os.environ["IMPALAD"] = self.config["impalad"][0]
        os.environ["REMOTE_LOAD"] = "1"
        os.environ["HADOOP_USER_NAME"] = "hdfs"
        os.environ["TEST_WAREHOUSE_DIR"] = self.config["hive_warehouse_directory"]
        os.environ["KUDU_MASTER"] = self.config["kudu_master"]

        if self.options.snapshot_file is None:
            if "SNAPSHOT_DIR" in os.environ:
                snapshot_dir = os.environ["SNAPSHOT_DIR"]
            else:
                snapshot_dir = "{0}/testdata/test-warehouse-SNAPSHOT".format(
                    os.getenv("WORKSPACE"))
            if not os.path.isdir(snapshot_dir):
                err_msg = 'Snapshot directory "{0}" is not a valid directory'
                logger.error(err_msg.format(snapshot_dir))
                raise OSError("Could not find test-warehouse snapshot file.")

            logger.info("Snapshot directory: {0}".format(snapshot_dir))
            self.snapshot_file = self.find_snapshot_file(snapshot_dir)
        else:
            self.snapshot_file = self.options.snapshot_file

        # Prepare shortcuts for connecting to remote services
        self.gtw_ssh = ssh.bake(
            "{0}@{1}".format(self.options.ssh_user, self.gateway),
            "-oStrictHostKeyChecking=no",
            "-oUserKnownHostsFile=/dev/null",
            t=True, _out=tee, _err=tee)

        self.beeline = sh.beeline.bake(
            silent=False, outputformat="csv2", n="impala",
            u="jdbc:hive2://{0}/default".format(self.config["hs2"]))

        self.load_test_warehouse = sh.Command(
            "{0}/testdata/bin/load-test-warehouse-snapshot.sh".format(
                self.impala_home)).bake(_out=tee, _err=tee)

        self.create_load_data = sh.Command(
            "{0}/testdata/bin/create-load-data.sh".format(self.impala_home))

        self.main_impalad = self.config["impalad"][0]
        self.impala_shell = sh.Command("impala-shell.sh").bake(
            i=self.main_impalad, _out=tee, _err=tee)

        self.python = sh.Command("impala-python").bake(u=True)
        self.compute_stats = sh.Command(
            "{0}/testdata/bin/compute-table-stats.sh".format(
                self.impala_home)).bake(_out=tee, _err=tee)

    @timing
    def load(self):
        """This method performs the actual data load.

        First it removes any known artifacts from the remote location. Next it
        drops potentially existing databases from the Hive Metastore. Then it
        invokes the load-test-warehouse-snapshot.sh and create-load-data.sh
        scripts with the appropriate parameters. The most important parameters
        are implicitly passed to the scripts as environment variables pointing
        to the remote HDFS, Hive and Impala.
        """
""" exploration_strategy = self.options.exploration_strategy logger.info("Removing other databases") dblist = self.beeline(e="show databases;", _err=tee).stdout database_list = dblist.split()[1:] # The first element is the header string for db in database_list: if db.strip() != "default": logger.debug("Dropping database %s", db) self.impala_shell(q="drop database if exists {0} cascade;".format(db)) logger.info("Invalidating metadata in Impala") self.impala_shell(q="invalidate metadata;") logger.info("Removing previous remote {0}".format( self.config["hive_warehouse_directory"])) r = sh.hdfs.dfs("-rm", "-r", "-f", "{0}".format( self.config["hive_warehouse_directory"])) logger.info("Expunging HDFS trash") r = sh.hdfs.dfs("-expunge") logger.info("Uploading test warehouse snapshot") self.load_test_warehouse(self.snapshot_file) # TODO: We need to confirm that if we change any permissions, that we don't # affect any running tests. See IMPALA-4375. logger.info("Changing warehouse ownership") r = sh.hdfs.dfs("-chown", "-R", "impala:hdfs", "{0}".format( self.config["hive_warehouse_directory"])) sh.hdfs.dfs("-chmod", "-R", "g+rwx", "{0}".format( self.config["hive_warehouse_directory"])) sh.hdfs.dfs("-chmod", "1777", "{0}".format( self.config["hive_warehouse_directory"])) logger.info("Calling create_load_data.sh") # The $USER variable is used in the create-load-data.sh script for beeline # impersonation. new_env = os.environ.copy() new_env["LOGNAME"] = "impala" new_env["USER"] = "******" new_env["USERNAME"] = "******" # Regardless of whether we are in fact skipping the snapshot load or not, # we nonetheless always pass -skip_snapshot_load to create-load-data.sh. # This is because we have already loaded the snapshot earlier in this # script, so we don't want create-load-data.sh to invoke # load-test-warehouse-snapshot.sh again. # # It would actually be nice to be able to skip the snapshot load, but # because of the existing messiness of create-load-data.sh, we can't. # This invocation... # # $ create-load-data.sh -skip_snapshot_load -exploration_strategy core # # ...results in this error: # # Creating /test-warehouse HDFS directory \ # (logging to create-test-warehouse-dir.log)... FAILED # 'hadoop fs -mkdir /test-warehouse' failed. Tail of log: # Log for command 'hadoop fs -mkdir /test-warehouse' # mkdir: `/test-warehouse': File exists # # Similarly, even though we might pass in "core" as the exploration strategy, # because we aren't loading a metadata snapshot (i.e., -skip_metadata_load is # false), an exhaustive dataload will always be done. This again is the result # of logic in create-load-data.sh, which itself ignores the value passed in # for -exploration_strategy. # # See IMPALA-4399: "create-load-data.sh has bitrotted to some extent, and needs # to be cleaned up" create_load_data_args = ["-skip_snapshot_load", "-cm_host", self.cm_host, "-snapshot_file", self.snapshot_file, "-exploration_strategy", exploration_strategy] self.create_load_data(*create_load_data_args, _env=new_env, _out=tee, _err=tee) sh.hdfs.dfs("-chown", "-R", "impala:hdfs", "{0}".format( self.config["hive_warehouse_directory"])) logger.info("Re-load HBase data") # Manually load the HBase data last. 
        self.python("{0}/bin/load-data.py".format(self.impala_home),
                    "--hive_warehouse_dir={0}".format(
                        self.config["hive_warehouse_directory"]),
                    "--table_formats=hbase/none",
                    "--hive_hs2_hostport={0}".format(self.config["hs2"]),
                    "--hdfs_namenode={0}".format(self.config["namenode"]),
                    "--exploration_strategy={0}".format(exploration_strategy),
                    workloads="functional-query",
                    force=True,
                    impalad=self.main_impalad,
                    _env=new_env, _out=tee, _err=tee)

        self.compute_stats()
        logger.info("Load data finished")

    # TODO: Should this be refactored out of this script? It has nothing to do
    # with data loading per se. If tests rely on the environment on the client
    # being set a certain way -- as in the prepare() method -- we may need to
    # find another way to deal with that. See IMPALA-4376.
    @timing
    def test(self):
        """Execute Impala's end-to-end tests against a remote cluster. All
        configuration parameters are picked up from the cluster configuration
        that was fetched via the CM API."""
        # TODO: Running tests via run-tests.py is currently not working against
        # a remote cluster (although running directly via py.test seems to
        # work). This method may be refactored out of this file under
        # IMPALA-4376, so for the time being, raise a NotImplementedError.
        raise NotImplementedError

        # Overwrite the username to match the service user on the remote system
        # and deal with the assumption that in the local development
        # environment the current user is the HDFS superuser as well.
        new_env = os.environ.copy()
        new_env["LOGNAME"] = "impala"
        new_env["USER"] = "******"
        new_env["USERNAME"] = "******"

        strategy = self.options.exploration_strategy
        logger.info("Running tests with exploration strategy {0}".format(strategy))
        run_tests = sh.Command("{0}/tests/run-tests.py".format(self.impala_home))
        run_tests("--skip_local_tests",
                  "--exploration_strategy={0}".format(strategy),
                  "--workload_exploration_strategy=functional-query:{0}".format(
                      strategy),
                  "--namenode_http_address={0}".format(self.config["namenode_http"]),
                  "--hive_server2={0}".format(self.config["hs2"]),
                  "--metastore_server={0}".format(self.config["metastore"]),
                  "query_test",
                  maxfail=10,
                  impalad=",".join(self.config["impalad"]),
                  _env=new_env, _out=tee, _err=tee)
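# The 'tee' output callback and the '@timing' decorator used throughout
# RemoteDataLoad are not shown in this excerpt. Minimal sketches of plausible
# implementations (assumptions, not the originals); 'timing' assumes the
# module-level 'logger' used above.
import sys
import time
from functools import wraps

def tee(line):
    """Stream subprocess output to stdout as it arrives (sh's _out/_err hook)."""
    sys.stdout.write(line)
    sys.stdout.flush()

def timing(func):
    """Log how long the wrapped method took to run."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        logger.info("{0} took {1:.1f}s".format(func.__name__,
                                               time.time() - start))
        return result
    return wrapper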
class RemoteDataLoad(object): """This is an implementation of the process to load a test-warehouse snapshot on a remote CM managed cluster. This script assumes that the warehouse snapshot was already downloaded and was either passed in as a parameter, or can be found by either inspecting the SNAPSHOT_DIR environment variable, or based on the WORKSPACE environment variable on a Jenkins build slave. The reason for the additional setup code is that in the local development environment it is assumed that $USER is HDFS superuser, which is not the case for remote deloyments. """ def __init__(self, cm_host, options): logger.info("Starting remote data load...") self.options = options self.cm_host = cm_host # Gateway host can be used if the CM host is not configured as a Hadoop gateway self.gateway = options.gateway if options.gateway else cm_host self.impala_home = os.environ["IMPALA_HOME"] self.api = ApiResource(self.cm_host, username=options.cm_user, password=options.cm_pass) # The API returns a list of clusters managed by the CM host. We're assuming # that this CM host was set up for the purpose of Impala testing on one # cluster, so the list should only have one value. self.cluster = self.api.get_all_clusters()[0] self.services = self.get_services() self.config = self.get_service_client_configurations() logger.info("Retrieved service configuration") logger.info(str(self.config)) self.prepare() logger.info("IMPALA_HOME: {0}".format(self.impala_home)) def get_hostname_for_ref(self, host_ref): """Translate the HostRef instance into the hostname.""" return self.api.get_host(host_ref.hostId).hostname @staticmethod def get_or_default(config): return config.value if config.value else config.default def get_services(self): """Confirm that all services are running, and return service dict.""" services = dict((s.type, s) for s in self.cluster.get_all_services()) if set(REQUIRED_SERVICES) != set(services.keys()): missing_services = set(REQUIRED_SERVICES) - set(services.keys()) logger.error("Services not installed: {0}".format( list(missing_services))) raise RuntimeError("Cluster not ready.") if not all(services[s].serviceState == 'STARTED' for s in services): stopped = [ s for s in services if services[s].serviceState != "STARTED" ] logger.error("Not all services started: {0}".format(stopped)) raise RuntimeError("Cluster not ready.") return services @timing def download_client_config(self, cluster, service): """Download the client configuration zip for a particular cluster and service. Since cm_api does not provide a way to download the archive we build the URL manually and download the file. Once it downloaded the file the archive is extracted and its content is copied to the Hadoop configuration directories defined by Impala. 
""" logger.info("Downloading client configuration for {0}".format( service.name)) url = "http://{0}:7180/api/{1}/clusters/{2}/services/{3}/clientConfig".format( self.cm_host, CM_API_VERSION, urlquote(cluster.name), urlquote(service.name)) path = mkdtemp() sh.curl(url, o=os.path.join(path, "clientConfig.zip"), _out=tee, _err=tee) current = os.getcwd() os.chdir(path) sh.unzip("clientConfig.zip") for root, _, file_names in os.walk("."): for filename in fnmatch.filter(file_names, "*.xml"): src = os.path.join(root, filename) dst = os.path.join(self.impala_home, "fe", "src", "test", "resources") logger.debug("Copying {0} to {1}".format(src, dst)) shutil.copy(src, dst) os.chdir(current) # TODO: this may be available in tests/comparison/cluster.py def set_hive_warehouse_dir(self, cluster, service): logger.info("Setting the Hive Warehouse Dir") for service in self.api.get_all_clusters()[0].get_all_services(): logger.info(service) if service.type == "HIVE": hive_config = {"hive_warehouse_directory": HIVE_WAREHOUSE_DIR} service.update_config(hive_config) # TODO: This functionality should be more generally available to other infrastructure # code, rather than being quarantined in this script. See IMPALA-4367. @timing def get_service_client_configurations(self): """Download the client configurations necessary to upload data to the remote cluster. Unfortunately, the CM API does not allow downloading it so we have to iterate over the services and download the config for all of them. In addition, returns an options dictionary with settings required for data loading like the HS2 server, Impala hosts, Name node etc. Returns: A client-configuration dictionary, e.g.: { 'hive_warehouse_directory': '/test-warehouse', 'hs2': 'impala-test-cluster-1.gce.cloudera.com:10000', 'impalad': ['impala-test-cluster-4.gce.cloudera.com:21000', 'impala-test-cluster-2.gce.cloudera.com:21000', 'impala-test-cluster-3.gce.cloudera.com:21000'], 'metastore': 'impala-test-cluster-1.gce.cloudera.com:9083', 'namenode': 'impala-test-cluster-1.gce.cloudera.com', 'namenode_http': 'impala-test-cluster-1.gce.cloudera.com:20101', 'kudu_master': 'impala-test-cluster-1.gce.cloudera.com' } """ # Iterate overs services and find the information we need result = {} for service_type, service in self.services.iteritems(): if service_type == "IMPALA": roles = service.get_roles_by_type("IMPALAD") impalads = [] for r in roles: rc_config = r.get_config("full") hostname = self.get_hostname_for_ref(r.hostRef) hs2_port = self.get_or_default(rc_config["beeswax_port"]) impalads.append("{0}:{1}".format(hostname, hs2_port)) result["impalad"] = impalads elif service_type == "HBASE": self.download_client_config(self.cluster, service) elif service_type == "HDFS": self.download_client_config(self.cluster, service) role = service.get_roles_by_type("NAMENODE") config = role[0].get_config("full") namenode = self.get_hostname_for_ref(role[0].hostRef) result["namenode"] = namenode result["namenode_http"] = "{0}:{1}".format( namenode, self.get_or_default(config["dfs_http_port"])) elif service_type == "HIVE": self.set_hive_warehouse_dir(self.cluster, service) self.download_client_config(self.cluster, service) hs2 = service.get_roles_by_type("HIVESERVER2")[0] rc_config = hs2.get_config("full") result["hive_warehouse_directory"] = self.get_or_default( service.get_config("full")[0]["hive_warehouse_directory"]) hostname = self.get_hostname_for_ref(hs2.hostRef) result["hs2"] = "{0}:{1}".format( hostname, self.get_or_default(rc_config["hs2_thrift_address_port"])) # Get 
                ms = service.get_roles_by_type("HIVEMETASTORE")[0]
                rc_config = ms.get_config("full")
                result["metastore"] = "{0}:{1}".format(
                    self.get_hostname_for_ref(ms.hostRef),
                    self.get_or_default(rc_config["hive_metastore_port"]))
            elif service_type == "KUDU":
                # The KUDU service does not require a client configuration.
                result["kudu_master"] = self.cm_host
        return result

    # TODO: This functionality should be more generally available to other
    # infrastructure code, rather than being quarantined in this script.
    # See IMPALA-4367.
    @staticmethod
    def find_snapshot_file(snapshot_dir):
        """Given snapshot_dir, walks the directory tree until it finds a file
        matching the test-warehouse archive pattern."""
        for root, _, file_names in os.walk(snapshot_dir):
            for filename in fnmatch.filter(file_names,
                                           "test-warehouse-*-SNAPSHOT.tar.gz"):
                logger.info("Found snapshot file {0}".format(filename))
                return os.path.join(root, filename)

    @timing
    def prepare(self):
        """Populate the environment of the process with the necessary values.

        In addition, it creates helper objects to run shell and SSH processes.
        """
        # Populate the environment with the required variables.
        os.environ["HS2_HOST_PORT"] = self.config["hs2"]
        os.environ["HDFS_NN"] = self.config["namenode"]
        os.environ["IMPALAD"] = self.config["impalad"][0]
        os.environ["REMOTE_LOAD"] = "1"
        os.environ["HADOOP_USER_NAME"] = "hdfs"
        os.environ["TEST_WAREHOUSE_DIR"] = self.config["hive_warehouse_directory"]
        os.environ["KUDU_MASTER"] = self.config["kudu_master"]

        if self.options.snapshot_file is None:
            if "SNAPSHOT_DIR" in os.environ:
                snapshot_dir = os.environ["SNAPSHOT_DIR"]
            else:
                snapshot_dir = "{0}/testdata/test-warehouse-SNAPSHOT".format(
                    os.getenv("WORKSPACE"))
            if not os.path.isdir(snapshot_dir):
                err_msg = 'Snapshot directory "{0}" is not a valid directory'
                logger.error(err_msg.format(snapshot_dir))
                raise OSError("Could not find test-warehouse snapshot file.")

            logger.info("Snapshot directory: {0}".format(snapshot_dir))
            self.snapshot_file = self.find_snapshot_file(snapshot_dir)
        else:
            self.snapshot_file = self.options.snapshot_file

        # Prepare shortcuts for connecting to remote services.
        self.gtw_ssh = ssh.bake(
            "{0}@{1}".format(self.options.ssh_user, self.gateway),
            "-oStrictHostKeyChecking=no",
            "-oUserKnownHostsFile=/dev/null",
            t=True, _out=tee, _err=tee)

        self.beeline = sh.beeline.bake(
            silent=False, outputformat="csv2", n="impala",
            u="jdbc:hive2://{0}/default".format(self.config["hs2"]))

        self.load_test_warehouse = sh.Command(
            "{0}/testdata/bin/load-test-warehouse-snapshot.sh".format(
                self.impala_home)).bake(_out=tee, _err=tee)

        self.create_load_data = sh.Command(
            "{0}/testdata/bin/create-load-data.sh".format(self.impala_home))

        self.main_impalad = self.config["impalad"][0]
        self.impala_shell = sh.Command("impala-shell.sh").bake(
            i=self.main_impalad, _out=tee, _err=tee)

        self.python = sh.Command("impala-python").bake(u=True)
        self.compute_stats = sh.Command(
            "{0}/testdata/bin/compute-table-stats.sh".format(
                self.impala_home)).bake(_out=tee, _err=tee)

    @timing
    def load(self):
        """This method performs the actual data load. First it removes any known
        artifacts from the remote location. Next it drops potentially existing
        databases from the Hive Metastore. Then it invokes the
        load-test-warehouse-snapshot.sh and create-load-data.sh scripts with the
        appropriate parameters. The most important parameters are implicitly passed
        to the scripts as environment variables pointing to the remote HDFS, Hive
        and Impala.
""" exploration_strategy = self.options.exploration_strategy logger.info("Removing other databases") dblist = self.beeline(e="show databases;", _err=tee).stdout database_list = dblist.split()[ 1:] # The first element is the header string for db in database_list: if db.strip() != "default": logger.debug("Dropping database %s", db) self.impala_shell( q="drop database if exists {0} cascade;".format(db)) logger.info("Invalidating metadata in Impala") self.impala_shell(q="invalidate metadata;") logger.info("Removing previous remote {0}".format( self.config["hive_warehouse_directory"])) r = sh.hdfs.dfs("-rm", "-r", "-f", "{0}".format(self.config["hive_warehouse_directory"])) logger.info("Expunging HDFS trash") r = sh.hdfs.dfs("-expunge") logger.info("Uploading test warehouse snapshot") self.load_test_warehouse(self.snapshot_file) # TODO: We need to confirm that if we change any permissions, that we don't # affect any running tests. See IMPALA-4375. logger.info("Changing warehouse ownership") r = sh.hdfs.dfs("-chown", "-R", "impala:hdfs", "{0}".format(self.config["hive_warehouse_directory"])) sh.hdfs.dfs("-chmod", "-R", "g+rwx", "{0}".format(self.config["hive_warehouse_directory"])) sh.hdfs.dfs("-chmod", "1777", "{0}".format(self.config["hive_warehouse_directory"])) logger.info("Calling create_load_data.sh") # The $USER variable is used in the create-load-data.sh script for beeline # impersonation. new_env = os.environ.copy() new_env["LOGNAME"] = "impala" new_env["USER"] = "******" new_env["USERNAME"] = "******" # Regardless of whether we are in fact skipping the snapshot load or not, # we nonetheless always pass -skip_snapshot_load to create-load-data.sh. # This is because we have already loaded the snapshot earlier in this # script, so we don't want create-load-data.sh to invoke # load-test-warehouse-snapshot.sh again. # # It would actually be nice to be able to skip the snapshot load, but # because of the existing messiness of create-load-data.sh, we can't. # This invocation... # # $ create-load-data.sh -skip_snapshot_load -exploration_strategy core # # ...results in this error: # # Creating /test-warehouse HDFS directory \ # (logging to create-test-warehouse-dir.log)... FAILED # 'hadoop fs -mkdir /test-warehouse' failed. Tail of log: # Log for command 'hadoop fs -mkdir /test-warehouse' # mkdir: `/test-warehouse': File exists # # Similarly, even though we might pass in "core" as the exploration strategy, # because we aren't loading a metadata snapshot (i.e., -skip_metadata_load is # false), an exhaustive dataload will always be done. This again is the result # of logic in create-load-data.sh, which itself ignores the value passed in # for -exploration_strategy. # # See IMPALA-4399: "create-load-data.sh has bitrotted to some extent, and needs # to be cleaned up" create_load_data_args = [ "-skip_snapshot_load", "-cm_host", self.cm_host, "-snapshot_file", self.snapshot_file, "-exploration_strategy", exploration_strategy ] self.create_load_data(*create_load_data_args, _env=new_env, _out=tee, _err=tee) sh.hdfs.dfs("-chown", "-R", "impala:hdfs", "{0}".format(self.config["hive_warehouse_directory"])) logger.info("Re-load HBase data") # Manually load the HBase data last. 
        self.python("{0}/bin/load-data.py".format(self.impala_home),
                    "--hive_warehouse_dir={0}".format(
                        self.config["hive_warehouse_directory"]),
                    "--table_formats=hbase/none",
                    "--hive_hs2_hostport={0}".format(self.config["hs2"]),
                    "--hdfs_namenode={0}".format(self.config["namenode"]),
                    "--exploration_strategy={0}".format(exploration_strategy),
                    workloads="functional-query",
                    force=True,
                    impalad=self.main_impalad,
                    _env=new_env, _out=tee, _err=tee)

        self.compute_stats()
        logger.info("Load data finished")

    # TODO: Should this be refactored out of this script? It has nothing to do with
    # data loading per se. If tests rely on the environment on the client being set
    # a certain way -- as in the prepare() method -- we may need to find another way
    # to deal with that. See IMPALA-4376.
    @timing
    def test(self):
        """Execute Impala's end-to-end tests against a remote cluster. All
        configuration parameters are picked from the cluster configuration that was
        fetched via the CM API."""
        # TODO: Running tests via run-tests.py is currently not working against a
        # remote cluster (although running directly via py.test seems to work). This
        # method may be refactored out of this file under IMPALA-4376, so for the
        # time being, raise a NotImplementedError.
        raise NotImplementedError

        # Overwrite the username to match the service user on the remote system and
        # deal with the assumption that in the local development environment the
        # current user is the HDFS superuser as well.
        new_env = os.environ.copy()
        new_env["LOGNAME"] = "impala"
        new_env["USER"] = "******"
        new_env["USERNAME"] = "******"

        strategy = self.options.exploration_strategy
        logger.info("Running tests with exploration strategy {0}".format(strategy))
        run_tests = sh.Command("{0}/tests/run-tests.py".format(self.impala_home))
        run_tests(
            "--skip_local_tests",
            "--exploration_strategy={0}".format(strategy),
            "--workload_exploration_strategy=functional-query:{0}".format(strategy),
            "--namenode_http_address={0}".format(self.config["namenode_http"]),
            "--hive_server2={0}".format(self.config["hs2"]),
            "--metastore_server={0}".format(self.config["metastore"]),
            "query_test",
            maxfail=10,
            impalad=",".join(self.config["impalad"]),
            _env=new_env, _out=tee, _err=tee)
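# A minimal usage sketch for RemoteDataLoad. The option names below mirror the
# attributes the class reads (cm_user, cm_pass, gateway, ssh_user, snapshot_file,
# exploration_strategy), but the real script may define its command line
# differently; treat this driver as an assumption, not the actual entry point.
if __name__ == "__main__":
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("--cm_host", help="Cloudera Manager host")
    parser.add_option("--cm_user", default="admin")
    parser.add_option("--cm_pass", default="******")
    parser.add_option("--gateway", default=None)
    parser.add_option("--ssh_user", default="jenkins")
    parser.add_option("--snapshot_file", default=None)
    parser.add_option("--exploration_strategy", default="core")
    options, _ = parser.parse_args()

    # prepare() already runs in __init__, so load() can be called directly.
    RemoteDataLoad(options.cm_host, options).load()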
addHost = cluster.add_hosts(newHostList)
# Wait for 5 minutes so that the parcels get downloaded, distributed and activated.
print "++Wait Time++ 300 seconds"
time.sleep(300)

if __name__ == '__main__':
    api = ApiResource(clouderaManagerHost, clouderaManagerPort,
                      clouderaManagerUserName, clouderaManagerPassword,
                      use_tls=clouderaManagerHTTPS)
    cluster = api.get_cluster(clusterDisplayName)
    hostlist = []
    for hostName in api.get_all_hosts():
        if hostName.hostname in newHosts:
            host = api.get_host(hostName.hostId)
            hostlist.append(host.hostId)
    addHost = addHostToCluster(api, cluster, hostlist)
    start_time = time.time()
    parcel = cluster.get_parcel('CDH', parcelVersion)
    # Check for parcel deployment errors.
    print "++ Checking Parcel Deployment"
    while True:
        # Re-fetch the parcel each iteration; the parcel object is a snapshot and
        # never changes stage on its own.
        parcel = cluster.get_parcel('CDH', parcelVersion)
        if parcel.stage == 'ACTIVATED':
            print "CDH Parcels Activated"
            break
        if parcel.state.errors:
            raise Exception(str(parcel.state.errors))
        print parcel.stage
        print "progress: %s / %s" % (parcel.state.progress, parcel.state.totalProgress)
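# A sketch that generalizes the polling loop above: re-fetch the parcel on every
# iteration and wait until it reaches the requested stage. wait_for_parcel_stage
# is an illustrative helper name, not part of cm_api.
def wait_for_parcel_stage(cluster, product, version, target_stage, poll_secs=15):
    while True:
        parcel = cluster.get_parcel(product, version)
        if parcel.state.errors:
            raise Exception(str(parcel.state.errors))
        if parcel.stage == target_stage:
            return parcel
        print "progress: %s / %s" % (parcel.state.progress, parcel.state.totalProgress)
        time.sleep(poll_secs)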
def main():
    module = AnsibleModule(argument_spec=dict((argument, {'type': 'str'})
                                              for argument in MODULE_ARGUMENTS))

    api = ApiResource('localhost', username=ADMIN_USER, password=ADMIN_PASS,
                      version=10)
    cluster_name = CLUSTER_NAME

    manager = api.get_cloudera_manager()

    action_a = module.params.get('action', None)

    if action_a == 'create_cluster':
        license_a = module.params.get('license', None)
        version_a = module.params.get('version', None)

        cluster_list = [x.name for x in api.get_all_clusters()]
        if cluster_name in cluster_list:
            module.exit_json(changed=False, msg='Cluster exists')
        else:
            cluster = api.create_cluster(CLUSTER_NAME, fullVersion=version_a)
            if license_a is None:
                manager.begin_trial()
            else:
                manager.update_license(license_a.decode('base64'))
            module.exit_json(changed=True, msg='Cluster created')
    elif action_a in ['add_host', 'create_mgmt', 'deploy_parcel', 'deploy_hdfs_base',
                      'deploy_hdfs_httpfs', 'deploy_hdfs_ha', 'deploy_rm_ha',
                      'set_config', 'service', 'deploy_service',
                      'deploy_service_worker_nodes', 'deploy_base_roles',
                      'run_command', 'cluster', 'create_snapshot_policy']:
        # More complicated actions that need an already created cluster go here.
        cluster = api.get_cluster(cluster_name)
        host_map = dict((api.get_host(x.hostId).hostname, x.hostId)
                        for x in cluster.list_hosts())

        # Adds a host to the cluster.
        # host_name should be in the internal DNS format,
        # e.g. ip-xx-xx-xx.compute.internal
        if action_a == 'add_host':
            host_a = module.params.get('host', None)

            host_list = host_map.keys()
            if host_a in host_list:
                module.exit_json(changed=False, msg='Host already in cluster')
            else:
                try:
                    cluster.add_hosts([host_a])
                except ApiException:
                    # If a host isn't there, it could be because the agent didn't
                    # manage to connect yet, so let's wait a moment for it.
                    sleep(120)
                    cluster.add_hosts([host_a])
                module.exit_json(changed=True, msg='Host added')

        # Create the management service and set its basic configuration.
        # This needs a separate function since management is handled differently
        # than the rest of the services.
        elif action_a == 'create_mgmt':
            host_a = module.params.get('host', None)

            # Getting the management service is the only way to check if mgmt
            # exists; an exception means there isn't one.
            try:
                mgmt = manager.get_service()
                module.exit_json(changed=False, msg='Mgmt service already exists')
            except ApiException:
                pass

            mgmt = manager.create_mgmt_service(ApiServiceSetupInfo())

            # This is ugly... and I see no good way to unuglify it.
            firehose_passwd = Popen(
                "sudo grep com.cloudera.cmf.ACTIVITYMONITOR.db.password "
                "/etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'",
                shell=True, stdout=PIPE).stdout.read().rstrip("\n")
            reports_passwd = Popen(
                "sudo grep com.cloudera.cmf.REPORTSMANAGER.db.password "
                "/etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'",
                shell=True, stdout=PIPE).stdout.read().rstrip("\n")

            # Since there is no easy way of configuring the manager...
            # ...let's do it here :(
            role_conf = defaultdict(dict)
            role_conf['ACTIVITYMONITOR'] = {
                'firehose_database_host': '{0}:7432'.format(host_a),
                'firehose_database_user': '******',
                'firehose_database_password': firehose_passwd,
                'firehose_database_type': 'postgresql',
                'firehose_database_name': 'amon',
                'firehose_heapsize': '268435456',
            }
            role_conf['EVENTSERVER'] = {
                'event_server_heapsize': '215964392'
            }
            role_conf['REPORTSMANAGER'] = {
                'headlamp_database_host': '{0}:7432'.format(host_a),
                'headlamp_database_user': '******',
                'headlamp_database_password': reports_passwd,
                'headlamp_database_type': 'postgresql',
                'headlamp_database_name': 'rman',
                'headlamp_heapsize': '215964392',
            }

            roles = ['ACTIVITYMONITOR', 'ALERTPUBLISHER', 'EVENTSERVER',
                     'HOSTMONITOR', 'SERVICEMONITOR', 'REPORTSMANAGER']
            # Create the management roles.
            for role in roles:
                mgmt.create_role('{0}-1'.format(role), role, host_map[host_a])

            # Update the configuration of each role.
            for group in mgmt.get_all_role_config_groups():
                group.update_config(role_conf[group.roleType])

            mgmt.start().wait()
            # After starting, this service needs time to spin up.
            sleep(30)
            module.exit_json(changed=True, msg='Mgmt created and started')

        # Deploy a given parcel on all hosts in the cluster.
        # You can specify a substring of the version ending with "latest", for
        # example 5.3-latest instead of 5.3.5-1.cdh5.3.5.p0.4.
        elif action_a == 'deploy_parcel':
            name_a = module.params.get('name', None)
            version_a = module.params.get('version', None)

            if "latest" in version_a:
                available_versions = [x.version for x in cluster.get_all_parcels()
                                      if x.product == name_a]
                if "-latest" in version_a:
                    version_substr = match('(.+?)-latest', version_a).group(1)
                # If the version is just "latest", try to check everything.
                else:
                    version_substr = ".*"
                try:
                    [version_parcel] = [x for x in available_versions
                                        if re.match(version_substr, x) is not None]
                except ValueError:
                    module.fail_json(
                        msg="Specified version {0} doesn't appear in {1} or appears twice".format(
                            version_substr, available_versions))
            else:
                version_parcel = version_a

            # We now go through the various stages of getting the parcel. As there
            # is no built-in way of waiting for an operation to complete, we use
            # loops with sleep to get it done.
            parcel = cluster.get_parcel(name_a, version_parcel)
            if parcel.stage == 'AVAILABLE_REMOTELY':
                parcel.start_download()

                while parcel.stage != 'DOWNLOADED':
                    parcel = cluster.get_parcel(name_a, version_parcel)
                    if parcel.state.errors:
                        raise Exception(str(parcel.state.errors))
                    sleep(10)

            if parcel.stage == 'DOWNLOADED':
                parcel.start_distribution()

                while parcel.stage != 'DISTRIBUTED':
                    parcel = cluster.get_parcel(name_a, version_parcel)
                    if parcel.state.errors:
                        raise Exception(str(parcel.state.errors))
                    # Sleep while hosts report problems after the download.
                    for i in range(12):
                        sleep(10)
                        if sum([1 for x in api.get_all_hosts(view='Full')
                                if x.healthSummary != 'GOOD']) == 0:
                            break

            # Since parcels are distributed automatically when a new host is added
            # to a cluster, we can encounter the "ACTIVATING" stage here.
            if parcel.stage == 'DISTRIBUTED' or parcel.stage == 'ACTIVATING':
                if parcel.stage == 'DISTRIBUTED':
                    parcel.activate()

                while parcel.stage != 'ACTIVATED':
                    parcel = cluster.get_parcel(name_a, version_parcel)
                    # This sleep has to be large because although the operation is
                    # very fast, it makes the management and cloudera hosts go
                    # bonkers, failing all of the health checks.
                    sleep(10)

                # Sleep while hosts report problems after the distribution.
                for i in range(60):
                    sleep(10)
                    if sum([1 for x in api.get_all_hosts(view='Full')
                            if x.healthSummary != 'GOOD']) == 0:
                        break
                module.exit_json(changed=True, msg='Parcel activated')

            if parcel.stage == 'ACTIVATED':
                module.exit_json(changed=False, msg='Parcel already activated')

            # If we get down here, something is not right.
            module.fail_json(msg='Invalid parcel state')

        # Deploy nodes for workers, according to SERVICE_WORKER_MAP.
        # Also give them sane names, and init the zookeeper and kafka ones,
        # which need ids specified.
        elif action_a == 'deploy_service_worker_nodes':
            host_a = module.params.get('host', None)
            service_a = module.params.get('service', None)

            service_name = SERVICE_MAP[service_a]
            role_name = SERVICE_WORKER_MAP[service_a]['name']
            full_role_name = SERVICE_WORKER_MAP[service_a]['formatstring']

            if service_name not in [x.name for x in cluster.get_all_services()]:
                service = cluster.create_service(service_name, service_name)
            else:
                service = cluster.get_service(service_name)
            nodes = [x for x in service.get_all_roles() if role_name in x.name]

            # If the host already has the given group, we should skip it.
            if host_map[host_a] in [x.hostRef.hostId for x in nodes]:
                module.exit_json(changed=False,
                                 msg='Host already is a {0}'.format(role_name))
            # Find out the highest id that currently exists.
            else:
                node_names = [x.name for x in nodes]
                if len(node_names) == 0:
                    # If there are no nodes, start numbering from 1.
                    node_i = 1
                else:
                    # Take the max number and add 1 to it.
                    node_i = max([int(x.split('-')[-1]) for x in node_names]) + 1

                if service_name == 'ZOOKEEPER':
                    role = service.create_role(full_role_name.format(node_i),
                                               'SERVER', host_a)
                    # Zookeeper needs a per-node ID in the configuration, so we set
                    # it now.
                    role.update_config({'serverId': node_i})
                elif service_name == 'KAFKA':
                    role = service.create_role(full_role_name.format(node_i),
                                               role_name, host_a)
                    # Kafka needs a per-node ID in the configuration, so we set
                    # it now.
                    role.update_config({'broker.id': node_i})
                else:
                    service.create_role(full_role_name.format(node_i), role_name,
                                        host_a)

                module.exit_json(changed=True,
                                 msg='Added host to {0} role'.format(role_name))

        # Deploy a service. Just create it, don't do anything more.
        # This is needed mainly when we have to set service properties before
        # role deployment.
        elif action_a == 'deploy_service':
            name_a = module.params.get('name', None)

            if name_a not in SERVICE_MAP:
                module.fail_json(msg='Unknown service: {0}'.format(name_a))
            service_name = SERVICE_MAP[name_a]
            if service_name not in [x.name for x in cluster.get_all_services()]:
                service = cluster.create_service(service_name, service_name)
                module.exit_json(changed=True,
                                 msg='{0} service created'.format(service_name))
            else:
                module.exit_json(changed=False,
                                 msg='{0} service already exists'.format(service_name))

        # Deploy the base hdfs roles (the namenode and secondary).
        # This doesn't create the service, as at least one datanode should already
        # be added!
        # The format also requires certain properties to be set before we run it.
        elif action_a == 'deploy_hdfs_base':
            nn_host_a = module.params.get('nn_host', None)
            sn_host_a = module.params.get('sn_host', None)

            changed = False

            hdfs = cluster.get_service('HDFS')
            hdfs_roles = [x.name for x in hdfs.get_all_roles()]

            # Don't create a secondary namenode when:
            # - there is one that already exists
            # - there is a second namenode, which means we have HA and don't need
            #   a secondary
            if ('HDFS-SECONDARYNAMENODE' not in hdfs_roles
                    and 'HDFS-NAMENODE-2' not in hdfs_roles):
                hdfs.create_role('HDFS-SECONDARYNAMENODE', 'SECONDARYNAMENODE',
                                 sn_host_a)
                changed = True

            # Create a namenode and format its FS.
            # Formatting the namenode requires at least one datanode and a
            # secondary namenode already in the cluster!
            if 'HDFS-NAMENODE' not in hdfs_roles:
                hdfs.create_role('HDFS-NAMENODE', 'NAMENODE', nn_host_a)
                for command in hdfs.format_hdfs('HDFS-NAMENODE'):
                    if command.wait().success == False:
                        module.fail_json(
                            msg='Failed formatting HDFS namenode with error: {0}'.format(
                                command.resultMessage))
                changed = True

            module.exit_json(changed=changed, msg='Created HDFS service & NN roles')

        # Enable HttpFS for HDFS.
        # HUE requires this to support HA in HDFS.
        elif action_a == 'deploy_hdfs_httpfs':
            host_a = module.params.get('host', None)

            hdfs = cluster.get_service('HDFS')
            hdfs_roles = [x.name for x in hdfs.get_all_roles()]

            # Don't install a second instance of HttpFS.
            if len([role for role in hdfs_roles if 'HDFS-HTTPFS' in role]) != 0:
                module.exit_json(changed=False,
                                 msg='HDFS HttpFS service already exists')

            hdfs.create_role('HDFS-HTTPFS-1', 'HTTPFS', host_map[host_a])
            module.exit_json(changed=True, msg='HDFS HttpFS service created')

        # Enable HA for HDFS.
        # This deletes the secondary namenode and creates a second namenode in its
        # place. It also spawns 3 journal node and 2 failover controller roles.
        elif action_a == 'deploy_hdfs_ha':
            sn_host_a = module.params.get('sn_host', None)
            jn_names_a = [module.params.get('jn1_host', None),
                          module.params.get('jn2_host', None),
                          module.params.get('jn3_host', None)]

            hdfs = cluster.get_service('HDFS')

            # If there's a second namenode, this means we already have HA enabled.
            if 'HDFS-NAMENODE-2' not in [x.name for x in hdfs.get_all_roles()]:
                # This is bad and I should feel bad.
                # jns is a list of dictionaries; each dict passes the required
                # journalnode parameters.
                jns = [{'jnHostId': host_map[jn_name],
                        'jnEditsDir': '/data0/hadoop/journal',
                        'jnName': 'HDFS-JOURNALNODE-{0}'.format(i + 1)}
                       for i, jn_name in enumerate(jn_names_a)]

                # This call is so long because we set some predictable names for
                # the services.
                command = hdfs.enable_nn_ha(
                    'HDFS-NAMENODE', host_map[sn_host_a], 'nameservice1', jns,
                    zk_service_name='ZOOKEEPER',
                    active_fc_name='HDFS-FAILOVERCONTROLLER-1',
                    standby_fc_name='HDFS-FAILOVERCONTROLLER-2',
                    standby_name='HDFS-NAMENODE-2')

                children = command.wait().children
                for command_children in children:
                    # The format command is expected to fail, since we already
                    # formatted the namenode.
                    if (command_children.name != 'Format'
                            and command_children.success == False):
                        module.fail_json(
                            msg='Command {0} failed when enabling HDFS HA with error {1}'.format(
                                command_children.name,
                                command_children.resultMessage))
                module.exit_json(changed=True, msg='Enabled HA for HDFS service')
            else:
                module.exit_json(changed=False, msg='HDFS HA already enabled')
        # Enable HA for YARN.
        elif action_a == 'deploy_rm_ha':
            sn_host_a = module.params.get('sn_host', None)

            yarn = cluster.get_service('YARN')

            # If there are two roles matching this name, HA for
            # YARN is already enabled.
            if len([0 for x in yarn.get_all_roles()
                    if match('^YARN-RESOURCEMANAGER.*$', x.name) is not None]) == 1:
                command = yarn.enable_rm_ha(sn_host_a, zk_service_name='ZOOKEEPER')
                children = command.wait().children
                for command_children in children:
                    if command_children.success == False:
                        module.fail_json(
                            msg='Command {0} failed when enabling YARN HA with error {1}'.format(
                                command_children.name,
                                command_children.resultMessage))
                module.exit_json(changed=True, msg='Enabled HA for YARN service')
            else:
                module.exit_json(changed=False, msg='YARN HA already enabled')

        # Deploy the base roles for a service, according to BASE_SERVICE_ROLE_MAP.
        # After the deployments, run the commands specified in
        # BASE_SERVICE_ROLE_MAP.
        elif action_a == 'deploy_base_roles':
            host_a = module.params.get('host', None)
            service_a = module.params.get('service', None)

            service_name = SERVICE_MAP[service_a]
            changed = False

            if service_name not in [x.name for x in cluster.get_all_services()]:
                service = cluster.create_service(service_name, service_name)
            else:
                service = cluster.get_service(service_name)
            service_roles = [x.name for x in service.get_all_roles()]

            # Create each service from the map.
            for (role_name, cloudera_name) in BASE_SERVICE_ROLE_MAP[service_a].items():
                # Check if the role already exists; the script can't compare it
                # directly, because after enabling HA on YARN, roles will have
                # random strings in their names.
                if len([0 for x in service_roles
                        if match(role_name, x) is not None]) == 0:
                    service.create_role(role_name, cloudera_name, host_a)
                    changed = True

                    # Init commands.
                    if role_name in SERVICE_INIT_COMMANDS.keys():
                        for command_to_run in SERVICE_INIT_COMMANDS[role_name]:
                            # Different handling of commands specified by name and
                            # ones specified by an instance method.
                            if ismethod(command_to_run):
                                command = command_to_run(service)
                            else:
                                command = service.service_command_by_name(
                                    command_to_run)

                            if command.wait().success == False:
                                module.fail_json(
                                    msg='Running {0} failed with {1}'.format(
                                        command_to_run, command.resultMessage))

            if changed == True:
                module.exit_json(changed=True,
                                 msg='Created base roles for {0}'.format(service_name))
            else:
                module.exit_json(changed=False,
                                 msg='{0} base roles already exist'.format(service_name))

        # Set config values for a given service/role.
        elif action_a == 'set_config':
            entity_a = module.params.get('entity', None)
            service_a = module.params.get('service', None)
            role_a = module.params.get('role', None)
            name_a = module.params.get('name', None)
            value_a = module.params.get('value', None)

            if service_a not in SERVICE_MAP:
                module.fail_json(msg='Unknown service: {0}'.format(service_a))

            # Since management is handled differently, it needs a different service.
            if service_a == 'management':
                service = manager.get_service()
            elif service_a == 'cm':
                service = manager
            else:
                service = cluster.get_service(SERVICE_MAP[service_a])

            # Role and service configs are handled differently.
            if entity_a == 'service':
                prev_config = service.get_config()
                curr_config = service.update_config({name_a: value_a})
                if service_a == 'cm':
                    prev_config = [prev_config]
                    curr_config = [curr_config]
                module.exit_json(
                    changed=(str(prev_config[0]) != str(curr_config[0])),
                    msg='Config value for {0}: {1}'.format(
                        name_a, curr_config[0][name_a]))

            elif entity_a == 'role':
                if role_a not in ROLE_MAP:
                    module.fail_json(msg='Unknown role: {0}'.format(service))

                role = service.get_role_config_group(ROLE_MAP[role_a])
                prev_config = role.get_config()
                curr_config = role.update_config({name_a: value_a})
                module.exit_json(
                    changed=(str(prev_config) != str(curr_config)),
                    msg='Config value for {0}: {1}'.format(name_a,
                                                           curr_config[name_a]))
            else:
                module.fail_json(msg='Invalid entity, must be one of service, role')

        # Handle service state.
        # Currently this can only start/restart a service.
        elif action_a == 'service':
            state_a = module.params.get('state', None)
            service_a = module.params.get('service', None)

            try:
                if service_a == 'cm':
                    service = manager.get_service()
                else:
                    service = cluster.get_service(SERVICE_MAP[service_a])
            except ApiException:
                module.fail_json(msg='Service does not exist')

            # When starting a service, we also deploy the client config for it.
            if state_a == 'started':
                if service.serviceState == 'STARTED':
                    module.exit_json(changed=False, msg='Service already running')
                method = service.start
                verb = "start"
            elif state_a == 'restarted':
                method = service.restart
                verb = "restart"

            try:
                command = service.deploy_client_config()
                if command.wait().success == False:
                    module.fail_json(
                        msg='Deploying client config failed with {0}'.format(
                            command.resultMessage))
            # Since there is no way to check if a service handles client config
            # deployments, we try our best and pass the exception if it doesn't.
            except (ApiException, AttributeError):
                pass

            method().wait()

            # We need to wait for the cloudera checks to complete...
            # otherwise it will report as failing.
            sleep(10)
            for i in range(24):
                sleep(10)
                service = manager.get_service() if service_a == "cm" \
                    else cluster.get_service(SERVICE_MAP[service_a])
                if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD':
                    break

            service = manager.get_service() if service_a == "cm" \
                else cluster.get_service(SERVICE_MAP[service_a])
            if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD':
                module.exit_json(changed=True,
                                 msg='Service {0} successful'.format(verb))
            else:
                module.fail_json(msg='Service {0} failed'.format(verb))

        # Handle the cluster.
        # Currently this can only restart.
        elif action_a == 'cluster':
            state_a = module.params.get('state', None)

            if state_a == 'restarted':
                command = cluster.restart(redeploy_client_configuration=True)
                if command.wait().success == False:
                    module.fail_json(
                        msg='Cluster restart failed with {0}'.format(
                            command.resultMessage))
                else:
                    module.exit_json(changed=True, msg='Cluster restart successful')

        # Snapshot policy.
        # Only create is supported.
        elif action_a == 'create_snapshot_policy':
            name_a = module.params.get('name', None)
            value_a = module.params.get('value', None)
            service_a = module.params.get('service', None)
            service = cluster.get_service(SERVICE_MAP[service_a])
            payload = loads(value_a)
            # Check if the policy already exists. An exception is expected when
            # configuring for the first time.
            try:
                service.get_snapshot_policy(name_a)
                module.exit_json(changed=False, msg='Defined policy already exists')
            except ApiException:
                pass
            try:
                command = service.create_snapshot_policy(payload)
                module.exit_json(changed=True, msg='Snapshot policy was created.')
            except (ApiException, AttributeError):
                module.fail_json(msg='ERROR in creating snapshot policy.')
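# The 'service' handler above inlines a poll-until-healthy loop. A sketch of that
# wait extracted into a reusable helper (wait_until_healthy is a hypothetical
# name; the module itself does not define it):
def wait_until_healthy(get_service, attempts=24, delay_secs=10):
    """Poll until the service reports STARTED and GOOD, or give up."""
    for _ in range(attempts):
        sleep(delay_secs)
        service = get_service()
        if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD':
            return True
    return False

# Example: wait_until_healthy(lambda: cluster.get_service(SERVICE_MAP['hdfs']))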
def api_data_collection(request):
    """
    Application information collection restful api. Queries completed application
    information matching the given conditions and accumulates it.

    @method: GET
    @param from_time: Application finish time after this time.
                      format: "%d/%m/%Y %H:%M". time zone=UTC+8
    @param end_time: Application finish time before this time.
                     format: "%d/%m/%Y %H:%M". time zone=UTC+8
    @param queue_name: Query completed application information for a specific
                       queue name.
    @param app_type: Query completed application information for a specific
                     application type.
    @param app_state: Query completed application information for specific
                      application states, specified as a comma-separated list.
                      ex: FINISHED,FAILED,KILLED
    @return: json data
        { "success":False, "message":"error message" }
        { "success":True, "message": { "queue_view":{...}, "group_view":{...} } }
    """
    if request.method == "GET":
        response = {'success': False, 'message': ''}

        filter_dict = {}
        if "queue_name" in request.GET:
            filter_dict['queue_name'] = request.GET.get('queue_name')
        if "app_type" in request.GET:
            filter_dict['app_type'] = request.GET.get('app_type')
        if "app_state" in request.GET:
            filter_dict['app_state'] = request.GET.get('app_state').split(',')

        # time zone = Asia/Taipei = UTC+8
        from_time = datetime.strptime(request.GET.get('from_time'),
                                      "%d/%m/%Y %H:%M") - timedelta(hours=8)
        to_time = datetime.strptime(request.GET.get('end_time'),
                                    "%d/%m/%Y %H:%M") - timedelta(hours=8)

        # Get config.
        config = ConfigParser.ConfigParser()
        config.read(os.path.join(settings.BASE_DIR, "cluster.ini"))
        cm_host = config.get("CM", "cm.host")
        cm_port = config.get("CM", "cm.port")
        cm_version = config.get("CM", "cm.version")
        cm_username = config.get("CM", "cm.username")
        cm_password = config.get("CM", "cm.password")

        cluster_name = config.get("Cluster", "cluster.name")
        yarn_name = config.get("Cluster", "cluster.yarn.name")

        ldap_host = config.get("Ldap", "ldap.host")
        ldap_username = config.get("Ldap", "ldap.username")
        ldap_password = config.get("Ldap", "ldap.password")

        # Get active resource manager info.
        try:
            cm_api = ApiResource(cm_host, int(cm_port), username=cm_username,
                                 password=cm_password, version=int(cm_version))
            cm_cluster_obj = cm_api.get_cluster(name=cluster_name)
            cm_yarn_obj = cm_cluster_obj.get_service(name=yarn_name)

            find_active_rm = False
            for rm in cm_yarn_obj.get_roles_by_type(role_type="RESOURCEMANAGER"):
                if rm.haStatus == "ACTIVE":
                    host = cm_api.get_host(rm.hostRef.hostId)
                    active_rm_ip = host.ipAddress
                    active_rm_port = 8088
                    find_active_rm = True
                    break

            if not find_active_rm:
                message = "can not find active rm"
                print("[ERROR] " + message)
                response['success'] = False
                response['message'] = message
                return HttpResponse(json.dumps(response))
        except Exception as e:
            message = "can not get cm yarn object"
            print("[ERROR] " + message + str(e))
            response['success'] = False
            response['message'] = message
            return HttpResponse(json.dumps(response))

        # All application statistics.
        statistics_response = applications_statistics(active_rm_ip, active_rm_port,
                                                      from_time, to_time,
                                                      filter_dict)
        if statistics_response['success']:
            # Create an LDAP connection to look up the group of each account.
            ldap_connection = create_ldap_connection(ldap_host, ldap_username,
                                                     ldap_password)
            if not ldap_connection:
                message = "can not connect to ldap://" + ldap_host
                response['success'] = False
                response['message'] = message
                return HttpResponse(json.dumps(response))

            # Init queue view result & group view result.
            queue_view_final_result = statistics_response['message']
            group_view_final_result = {}

            # Add group information to the queue view result and accumulate the
            # result by group.
            for queue, queue_info in queue_view_final_result.items():
                queue_view_final_result[queue]['group'] = ''
                # queue naming: root.SYSTEM.<account>, root.PERSONAL.<account>
                m = re.match(r"(?P<root>\w+)\.(?P<second>\w+)\.(?P<third>\w+)", queue)
                if m and m.group('root') == 'root' and \
                        (m.group('second') == 'SYSTEM' or m.group('second') == 'PERSONAL'):
                    queue_view_final_result[queue]['account'] = m.group('third')
                    group_query_result = query_group_of_user(
                        ldap_connection, queue_view_final_result[queue]['account'])
                    group = group_query_result['group']
                    project_name = group_query_result['name']
                    queue_view_final_result[queue]['group'] = group
                    if group not in group_view_final_result:
                        group_view_final_result[group] = {'apps': {}, 'queues': [],
                                                          'name': project_name}
                    group_view_final_result[group]['queues'].append(queue)

                    for app_type, app_info in queue_info['apps'].items():
                        for app_state, data in app_info['final_status'].items():
                            if app_state not in group_view_final_result[group]['apps']:
                                group_view_final_result[group]['apps'][app_state] = {}
                            for key in data:
                                if key not in group_view_final_result[group]['apps'][app_state]:
                                    group_view_final_result[group]['apps'][app_state][key] = data[key]
                                else:
                                    group_view_final_result[group]['apps'][app_state][key] += data[key]

            # After accumulating all results, unbind the LDAP connection.
            ldap_connection.unbind()
        else:
            response['success'] = False
            response['message'] = statistics_response['message']
            return HttpResponse(json.dumps(response))

        # Transform the duration type from datetime.timedelta to string.
        queue_view_final_result = transform_queue_view_response(queue_view_final_result)
        group_view_final_result = transform_project_view_response(group_view_final_result)

        response['success'] = True
        response['message'] = {}
        response['message']['queue_view'] = queue_view_final_result
        response['message']['group_view'] = group_view_final_result
        print("[DEBUG] response = " + json.dumps(response))
        return HttpResponse(json.dumps(response))
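# An example client call against the view above. This is a sketch: the URL path
# and host are assumptions, since only the query parameters are defined by the
# view itself.
import requests

resp = requests.get("http://reporting-host:8000/api/data_collection",
                    params={"from_time": "01/01/2020 00:00",
                            "end_time": "02/01/2020 00:00",
                            "queue_name": "root.PERSONAL.alice",
                            "app_state": "FINISHED,FAILED,KILLED"})
print(resp.json())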
class ClouderaManager(object):
    """
    The complete orchestration of a cluster from start to finish, assuming all the
    hosts are configured and Cloudera Manager is installed with all the required
    databases set up.

    Handles all the steps required in creating a cluster. All the functions are
    built to behave idempotently, so you should be able to resume from any failed
    step by re-running __class__.setup().
    """

    def __init__(self, module, config, trial=False, license_txt=None):
        self.api = ApiResource(config['cm']['host'],
                               username=config['cm']['username'],
                               password=config['cm']['password'])
        self.manager = self.api.get_cloudera_manager()
        self.config = config
        self.module = module
        self.trial = trial
        self.license_txt = license_txt
        self.cluster = None

    def enable_license(self):
        """
        Enable the requested license, either in trial mode or with a full license
        that is entered and registered.
        """
        try:
            _license = self.manager.get_license()
        except ApiException:
            print_json(type="LICENSE", msg="Enabling license")
            if self.trial:
                self.manager.begin_trial()
            else:
                if self.license_txt is not None:
                    self.manager.update_license(self.license_txt)
                else:
                    fail(self.module,
                         'License should be provided or trial should be specified')

            try:
                _license = self.manager.get_license()
            except ApiException:
                fail(self.module, 'Failed enabling license')
        print_json(type="LICENSE",
                   msg="Owner: {}, UUID: {}".format(_license.owner, _license.uuid))

    def create_cluster(self):
        """
        Create a cluster and add hosts to it. A new cluster is only created if
        another one doesn't already exist with the same name.
        """
        print_json(type="CLUSTER", msg="Creating cluster")
        cluster_config = self.config['cluster']
        try:
            self.cluster = self.api.get_cluster(cluster_config['name'])
        except ApiException:
            print_json(type="CLUSTER",
                       msg="Creating Cluster entity: {}".format(cluster_config['name']))
            self.cluster = self.api.create_cluster(cluster_config['name'],
                                                   cluster_config['version'],
                                                   cluster_config['fullVersion'])

        cluster_hosts = [self.api.get_host(host.hostId).hostname
                         for host in self.cluster.list_hosts()]
        hosts = []
        for host in cluster_config['hosts']:
            if host not in cluster_hosts:
                hosts.append(host)
        self.cluster.add_hosts(hosts)

    def activate_parcels(self):
        print_json(type="PARCELS", msg="Setting up parcels")
        for parcel_cfg in self.config['parcels']:
            parcel = Parcels(self.module, self.manager, self.cluster,
                             parcel_cfg.get('version'), parcel_cfg.get('repo'),
                             parcel_cfg.get('product', 'CDH'))
            parcel.download()
            parcel.distribute()
            parcel.activate()

    @retry(attempts=20, delay=5)
    def wait_inspect_hosts(self, cmd):
        """
        Inspect all the hosts. Basically wait until the check completes on all
        hosts.

        :param cmd: A command instance used for tracking the status of the command
        """
        print_json(type="HOSTS", msg="Inspecting hosts")
        cmd = cmd.fetch()
        if cmd.success is None:
            raise ApiException("Waiting on command {} to finish".format(cmd))
        elif not cmd.success:
            if (cmd.resultMessage is not None and
                    'is not currently available for execution' in cmd.resultMessage):
                raise ApiException('Retry Command')
            fail(self.module, 'Host inspection failed')
        print_json(type="HOSTS",
                   msg="Host inspection completed: {}".format(cmd.resultMessage))

    def deploy_mgmt_services(self):
        """
        Configure, deploy and start all the Cloudera Management Services.
        """
        print_json(type="MGMT", msg="Deploying Management Services")
        try:
            mgmt = self.manager.get_service()
            if mgmt.serviceState == 'STARTED':
                return
        except ApiException:
            print_json(type="MGMT", msg="Management Services don't exist. Creating.")
Creating.") mgmt = self.manager.create_mgmt_service(ApiServiceSetupInfo()) for role in config['services']['MGMT']['roles']: if not len(mgmt.get_roles_by_type(role['group'])) > 0: print_json(type="MGMT", msg="Creating role for {}".format(role['group'])) mgmt.create_role('{}-1'.format(role['group']), role['group'], role['hosts'][0]) for role in config['services']['MGMT']['roles']: role_group = mgmt.get_role_config_group('mgmt-{}-BASE'.format(role['group'])) role_group.update_config(role.get('config', {})) mgmt.start().wait() if self.manager.get_service().serviceState == 'STARTED': print_json(type="MGMT", msg="Management Services started") else: fail(self.module, "[MGMT] Cloudera Management services didn't start up properly") def service_orchestrate(self, services): """ Create, pre-configure provided list of services Stop/Start those services Perform and post service startup actions :param services: List of Services to perform service specific actions """ service_classes = [] # Create and pre-configure provided services for service in services: service_config = self.config['services'].get(service.upper()) if service_config: svc = getattr(sys.modules[__name__], service)(self.cluster, service_config) if not svc.started: svc.deploy() svc.pre_start() service_classes.append(svc) print_json(type="CLUSTER", msg="Starting services: {} on Cluster".format(services)) # Deploy all the client configs, since some of the services depend on other services # and is essential that the client configs are in place self.cluster.deploy_client_config() # Start each service and run the post_start actions for each service for svc in service_classes: # Only go thru the steps if the service is not yet started. This helps with # re-running the script after fixing errors if not svc.started: svc.start() svc.post_start() def setup(self): # TODO(rnirmal): Cloudera Manager SSL? # Enable a full license or start a trial self.enable_license() # Create the cluster entity and associate hosts self.create_cluster() # Download and activate the parcels self.activate_parcels() # Inspect all the hosts self.wait_inspect_hosts(self.manager.inspect_hosts()) # Create Management services self.deploy_mgmt_services() # Configure and Start base services self.service_orchestrate(BASE_SERVICES) # Configure and Start remaining services self.service_orchestrate(ADDITIONAL_SERVICES)
class CmCluster(Cluster):

    def __init__(self, host_name, port=None, user="******", password="******",
                 cluster_name=None, ssh_user=None, ssh_port=None, ssh_key_file=None,
                 use_tls=False):
        # Initialize strptime() to work around https://bugs.python.org/issue7980.
        # Apparently something in the CM API uses strptime().
        strptime("2015", "%Y")

        Cluster.__init__(self)
        # IMPALA-5455: If the caller doesn't specify port, default it based on
        # use_tls.
        if port is None:
            if use_tls:
                port = CM_TLS_PORT
            else:
                port = CM_CLEAR_PORT
        self.cm = CmApiResource(host_name, server_port=port, username=user,
                                password=password, use_tls=use_tls)
        clusters = self.cm.get_all_clusters()
        if not clusters:
            raise Exception("No clusters found in CM at %s" % host_name)
        if cluster_name:
            clusters_by_name = dict((c.name, c) for c in clusters)
            if cluster_name not in clusters_by_name:
                raise Exception(("No clusters named %s found in CM at %s. "
                                 "Available clusters are %s.")
                                % (cluster_name, host_name,
                                   ", ".join(sorted(clusters_by_name.keys()))))
            self.cm_cluster = clusters_by_name[cluster_name]
        else:
            if len(clusters) > 1:
                raise Exception(("Too many clusters found in CM at %s;"
                                 " a cluster name must be provided") % host_name)
            self.cm_cluster = clusters[-1]

        self.ssh_user = ssh_user
        self.ssh_port = ssh_port
        self.ssh_key_file = ssh_key_file
        self._ssh_client_lock = Lock()
        self._ssh_clients_by_host_name = defaultdict(list)

    def shell(self, cmd, host_name, timeout_secs=DEFAULT_TIMEOUT):
        with self._ssh_client(host_name) as client:
            return client.shell(cmd, timeout_secs=timeout_secs)

    @contextmanager
    def _ssh_client(self, host_name):
        """Returns an SSH client for use in a 'with' block. When the 'with' context
        exits, the client will be kept for reuse.
        """
        with self._ssh_client_lock:
            clients = self._ssh_clients_by_host_name[host_name]
            if clients:
                client = clients.pop()
            else:
                # IMPALA-7460: Insulate this import away from the global context so
                # as to avoid requiring Paramiko unless it's absolutely needed.
                from tests.util.ssh_util import SshClient
                LOG.debug("Creating new SSH client for %s", host_name)
                client = SshClient()
                client.connect(host_name, username=self.ssh_user,
                               key_filename=self.ssh_key_file)
        error_occurred = False
        try:
            yield client
        except Exception:
            error_occurred = True
            raise
        finally:
            if not error_occurred:
                with self._ssh_client_lock:
                    self._ssh_clients_by_host_name[host_name].append(client)

    def _init_local_hadoop_conf_dir(self):
        self._local_hadoop_conf_dir = mkdtemp()
        data = StringIO(self.cm.get("/clusters/%s/services/%s/clientConfig"
                                    % (self.cm_cluster.name,
                                       self._find_service("HIVE").name)))
        zip_file = ZipFile(data)
        for name in zip_file.namelist():
            if name.endswith("/"):
                continue
            extract_path = os.path.join(self._local_hadoop_conf_dir,
                                        os.path.basename(name))
            with open(extract_path, "w") as conf_file:
                conf_file.write(zip_file.open(name).read())

    def _find_service(self, service_type):
        """Find a service by its CM API service type. An exception will be raised
        if no service is found or multiple services are found. See the CM API
        documentation for more details about the service type.
        """
        services = [s for s in self.cm_cluster.get_all_services()
                    if s.type == service_type]
        if not services:
            raise Exception("No service of type %s found in cluster %s"
                            % (service_type, self.cm_cluster.name))
        if len(services) > 1:
            raise Exception("Found %s services in cluster %s; only one is expected."
                            % (len(services), self.cm_cluster.name))
        return services[0]

    def _find_role(self, role_type, service_type):
        """Find a role by its CM API role and service type.
An exception will be raised if no roles are found. See the CM API documentation for more details about the service and role types. """ service = self._find_service(service_type) roles = service.get_roles_by_type(role_type) if not roles: raise Exception("No roles of type %s found in service %s" % (role_type, service.name)) return roles[0] def _init_hdfs(self): self._hdfs = Hdfs(self, "hdfs") def _init_hive(self): hs2 = self._find_role("HIVESERVER2", "HIVE") host = self.cm.get_host(hs2.hostRef.hostId) config = hs2.get_config(view="full")["hs2_thrift_address_port"] self._hive = Hive(self, str(host.hostname), int(config.value or config.default)) def _init_impala(self): self._impala = CmImpala(self, self._find_service("IMPALA"))
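# Usage sketch for CmCluster. The host names are placeholders; CM_CLEAR_PORT,
# CM_TLS_PORT, CmApiResource and the Cluster base class come from the
# surrounding module.
cluster = CmCluster("cm-host.example.com", user="admin", password="******",
                    ssh_user="ec2-user", ssh_key_file="/path/to/key.pem")
print(cluster.shell("hostname", "worker-1.example.com"))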
cm_host = "localhost" api = ApiResource(cm_host, username="******", password="******") print "*** CLUSTERS ***" clusters = None # List clusters for c in api.get_all_clusters(): print "Cluster \"%s\" is version %s" % (c.name, c.version) clusters = c print "*** HOSTS ***" for host_ref in c.list_hosts(): host = api.get_host(host_ref.hostId) print host.hostname print "*** SERVICES ***" hdfs = None # List services & health info for s in clusters.get_all_services(): print "Service \"%s\" -- state \"%s\" -- health \"%s\"" %(s.name, s.serviceState, s.healthSummary) # Get HDFS service if 'hdfs' in s.type.lower(): hdfs = s print "*** HDFS Service checks (" + hdfs.serviceUrl + ") ***" print "*** ROLES FOR HDFS ***"
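# The snippet above prints the roles header but stops there; a minimal
# continuation listing the HDFS roles (assuming an HDFS service was found):
for role in hdfs.get_all_roles():
    print "Role \"%s\" -- state \"%s\" -- health \"%s\"" % (role.name, role.roleState, role.healthSummary)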
class NiagaraCMApi(object):
    def __init__(self, cm_host, user, password, cluster='cluster', port='7180',
                 version=17):
        self.cm_host = cm_host
        self.user = user
        self.password = password
        self.cluster = cluster
        self.port = port
        self.version = version
        self.api = ApiResource(server_host=self.cm_host, server_port=self.port,
                               username=self.user, password=self.password,
                               version=self.version)

    def get_hosts_by_role(self, service_name, role, haStatus=None):
        """
        Gets all hosts that run a specific service and role.

        Args:
            service_name(str): Name of a service that runs under Cloudera Manager.
            role(str): Role name (e.g. KAFKA_BROKER)
        Returns:
            List of hostnames that run the given service and role type.
        """
        cluster = self.api.get_cluster(self.cluster)
        service = cluster.get_service(service_name)
        service_nodes = service.get_roles_by_type(role)
        result = []
        for server in service_nodes:
            if haStatus == 'ACTIVE' and server.haStatus != 'ACTIVE':
                continue
            host_reference = server.hostRef.hostId
            result.append(self.api.get_host(host_reference).hostname)
        return result

    def get_kafka_broker_id_by_hostname(self, nodename, role='KAFKA_BROKER',
                                        service_name='kafka'):
        cluster = self.api.get_cluster(self.cluster)
        service = cluster.get_service(service_name)
        service_nodes = service.get_roles_by_type(role)
        for node in service_nodes:
            hostname = self.api.get_host(node.hostRef.hostId).hostname
            if hostname == nodename:
                broker_id = node.get_config()['broker.id']
                return broker_id

    def get_service_ports(self, service_name, role_config_group):
        """
        Gets the port of a specific type of service.

        Args:
            service_name(str): Name of a service that runs under Cloudera Manager.
            role_config_group(str): Role config group name
                                    (e.g. kafka-KAFKA_BROKER-BASE)
        Returns:
            The configured port value for the service.
        """
        cluster = self.api.get_cluster(self.cluster)
        service = cluster.get_service(service_name)
        config = service.get_role_config_group(role_config_group)
        if service_name == 'kafka':
            try:
                kafka_port = config.config['port'].value
            except AttributeError:
                kafka_port = config.config['port']
            return kafka_port
        elif service_name == 'zookeeper':
            try:
                zk_port = config.config['clientPort'].value
            except AttributeError:
                zk_port = config.config['clientPort']
            return zk_port
        else:
            raise ValueError("Unknown service {0}".format(service_name))

    def get_all_role_config_groups(self, service_name):
        """
        Gets the names of all of a service's role config groups, which can be used
        in get_service_ports.

        Args:
            service_name(str): Name of a service that runs under Cloudera Manager.
        Returns:
            result(list): List of all available role config group names.
""" cluster = self.api.get_cluster(self.cluster) service = cluster.get_service(service_name) all_role_groups = service.get_all_role_config_groups() result = [] for role_group in all_role_groups: result.append(role_group.name) return result def get_log_dirs_for_kafka_broker(self, nodename, service_name='kafka', role='KAFKA_BROKER'): cluster = self.api.get_cluster(self.cluster) service = cluster.get_service(service_name) service_nodes = service.get_roles_by_type(role) for node in service_nodes: hostname = self.api.get_host(node.hostRef.hostId).hostname if hostname == nodename: config = node.get_config()['log.dirs'].split(',') return config def get_broker_status(self, nodename, service_name='kafka', role='KAFKA_BROKER'): cluster = self.api.get_cluster(self.cluster) service = cluster.get_service(service_name) service_nodes = service.get_roles_by_type(role) for node in service_nodes: hostname = self.api.get_host(node.hostRef.hostId).hostname if hostname == nodename: return node.roleState, node.maintenanceMode def kafka_broker_action(self, nodename, action, service_name='kafka', role='KAFKA_BROKER'): cluster = self.api.get_cluster(self.cluster) service = cluster.get_service(service_name) service_nodes = service.get_roles_by_type(role) for node in service_nodes: hostname = self.api.get_host(node.hostRef.hostId).hostname if hostname == nodename: _, maintenance = self.get_broker_status(nodename=nodename) if not maintenance: if action == 'start': cmd = service.start_roles(node.name) elif action == 'stop': cmd = service.stop_roles(node.name) elif action == 'restart': cmd = service.restart_roles(node.name) else: return 'Unknown action {0}'.format(action) cmd[0].wait() state, _ = self.get_broker_status(nodename=nodename) return state else: return maintenance def edit_log_dir_from_kafka_broker(self, nodename, log_dir, action, service_name='kafka', role='KAFKA_BROKER'): cluster = self.api.get_cluster(self.cluster) service = cluster.get_service(service_name) service_nodes = service.get_roles_by_type(role) for node in service_nodes: hostname = self.api.get_host(node.hostRef.hostId).hostname if hostname == nodename: config = node.get_config() try: log_dirs = config['log.dirs'] except KeyError: error = "No log dirs exists." log_dirs = '' if action == 'remove': if log_dir in log_dirs: new_log_dirs = log_dirs.replace(log_dir, '').replace( ',,', ',').strip(',') else: return 0, 'Log dir {0} is not in a config.'.format( log_dir) elif action == 'add': if log_dir not in log_dirs: new_log_dirs = log_dirs + ',' + log_dir.replace( ',,', ',').strip(',') else: return 0, 'Log dir {0} is already in a config.'.format( log_dir) else: return 2, 'Error: unknown action {0}'.format(action) new_config = config new_config['log.dirs'] = new_log_dirs try: node.update_config(new_config) except ApiException as e: return 1, 'Error: {0}'.format(e) else: return 0, 'Broker config updated.' def get_role_types(self, service_name): """ Methos gets all service's role names, that could be used in get_host_by_role method. Args: service_name(str): Name of service that runs under cloduera manager. Returns: all_roles(dict): Dictionary with all available role names. :param service_name: :return: """ cluster = self.api.get_cluster(self.cluster) service = cluster.get_service(service_name) all_roles = service.get_role_types() return all_roles