def main():
    """Entry point: load config, set up logging and run the chosen command.

    The command line is logged before the command runs and the command
    name is logged once ``CONF.command.func`` has returned.
    """
    # TODO(tmckay): Work on restricting the options
    # pulled in by imports which show up in the help.
    # If we find a nice way to do this the calls to
    # unregister_extra_cli_opt() can be removed
    CONF(project='sahara')

    # For some reason, this is necessary to clear cached values
    # and re-read configs.  For instance, if this is not done
    # here the 'plugins' value will not reflect the value from
    # the config file on the command line
    CONF.reload_config_files()
    log.setup(CONF, "sahara")

    # If we have to enforce extra option checks, like one option
    # requires another, do it here
    extra_option_checks()

    # Since this may be scripted, record the command in the log
    # so a user can know exactly what was done
    start_template = _LI("Command: {command}")
    LOG.info(start_template.format(command=' '.join(sys.argv)))

    api.set_logger(LOG)
    api.set_conf(CONF)

    CONF.command.func()

    finish_template = _LI("Finished {command}")
    LOG.info(finish_template.format(command=CONF.command.name))
def _await_networks(self, cluster, instances):
    """Wait for IP assignment, then for accessibility of ``instances``.

    Does nothing when ``instances`` is empty.  Two provisioning steps are
    registered ("Assign IPs" and "Wait for instance accessibility") and
    ``self._wait_until_accessible`` is spawned once per instance inside a
    thread group.
    """
    if not instances:
        return

    cpo.add_provisioning_step(cluster.id, _("Assign IPs"), len(instances))

    assigned = set()
    self._ips_assign(assigned, cluster, instances)

    ips_template = _LI(
        "Cluster {cluster_id}: all instances have IPs assigned")
    LOG.info(ips_template.format(cluster_id=cluster.id))

    # Re-read the cluster and re-select the instances using the set that
    # _ips_assign was given.
    cluster = conductor.cluster_get(context.ctx(), cluster)
    instances = g.get_instances(cluster, assigned)

    cpo.add_provisioning_step(
        cluster.id, _("Wait for instance accessibility"), len(instances))

    with context.ThreadGroup() as tg:
        for inst in instances:
            thread_name = "wait-for-ssh-%s" % inst.instance_name
            tg.spawn(thread_name, self._wait_until_accessible, inst)

    accessible_template = _LI(
        "Cluster {cluster_id}: all instances are accessible")
    LOG.info(accessible_template.format(cluster_id=cluster.id))
def start_cluster(self, cluster):
    """Start the namenode, the datanodes and (if present) the Spark master.

    After the HDFS processes are started, ``/user/$USER/`` is created on
    HDFS and chown-ed, the Spark master is started when a "master"
    instance exists, and ``self._set_cluster_info`` is called.
    """
    namenode = utils.get_instance(cluster, "namenode")
    spark_master = utils.get_instance(cluster, "master")
    datanodes = utils.get_instances(cluster, "datanode")

    # Start the name node
    with remote.get_remote(namenode) as nn_remote:
        run.format_namenode(nn_remote)
        run.start_processes(nn_remote, "namenode")

    # start the data nodes
    self._start_slave_datanode_processes(datanodes)

    LOG.info(_LI("Hadoop services in cluster %s have been started"),
             cluster.name)

    hdfs_commands = (
        "sudo -u hdfs hdfs dfs -mkdir -p /user/$USER/",
        "sudo -u hdfs hdfs dfs -chown $USER /user/$USER/",
    )
    with remote.get_remote(namenode) as nn_remote:
        for hdfs_command in hdfs_commands:
            nn_remote.execute_command(hdfs_command)

    # start spark nodes
    if spark_master:
        with remote.get_remote(spark_master) as sm_remote:
            run.start_spark_master(sm_remote, self._spark_home(cluster))
            LOG.info(_LI("Spark service at '%s' has been started"),
                     spark_master.hostname())

    LOG.info(_LI('Cluster %s has been started successfully'),
             cluster.name)
    self._set_cluster_info(cluster)
def start_services(self, cluster_name, cluster_spec, ambari_info):
    """PUT a "start all services" request to Ambari and wait for it.

    A 202 response is followed up via ``self._wait_for_async_request``;
    a 200 response needs no further action.

    Raises:
        ex.HadoopProvisionError: when the asynchronous request does not
            succeed, or the PUT returns a status other than 200/202.
    """
    url_template = ('http://{0}/api/v1/clusters/{1}/services?ServiceInfo/'
                    'state=INSTALLED')
    start_url = url_template.format(ambari_info.get_address(), cluster_name)

    body = ('{"RequestInfo" : { "context" : "Start all services" },'
            '"Body" : {"ServiceInfo": {"state" : "STARTED"}}}')

    self._fire_service_start_notifications(
        cluster_name, cluster_spec, ambari_info)
    result = self._put(start_url, ambari_info, data=body)
    status = result.status_code

    if status == 202:
        request_id = json.loads(result.text)['Requests']['id']
        request_uri = self._get_async_request_uri(
            ambari_info, cluster_name, request_id)
        if not self._wait_for_async_request(request_uri, ambari_info):
            LOG.error(_LE('Failed to start Hadoop cluster.'))
            raise ex.HadoopProvisionError(
                _('Start of Hadoop services failed.'))

        LOG.info(_LI("Successfully started Hadoop cluster."))
        address_template = _LI('Ambari server address: {server_address}')
        LOG.info(address_template.format(
            server_address=ambari_info.get_address()))
    elif status != 200:
        failure_template = _LE('Start command failed. Status: {status}, '
                               'response: {response}')
        LOG.error(failure_template.format(status=status,
                                          response=result.text))
        raise ex.HadoopProvisionError(
            _('Start of Hadoop services failed.'))
def _await_networks(self, cluster, instances):
    """Wait for IP assignment, then for accessibility of ``instances``.

    Does nothing when ``instances`` is empty.  Each accessibility check is
    spawned while that instance's id is set as the current instance id.
    """
    if not instances:
        return

    cpo.add_provisioning_step(cluster.id, _("Assign IPs"), len(instances))

    assigned = set()
    self._ips_assign(assigned, cluster, instances)

    LOG.info(_LI("All instances have IPs assigned"))

    # Re-read the cluster and re-select the instances using the set that
    # _ips_assign was given.
    cluster = conductor.cluster_get(context.ctx(), cluster)
    instances = g.get_instances(cluster, assigned)

    cpo.add_provisioning_step(
        cluster.id, _("Wait for instance accessibility"), len(instances))

    with context.ThreadGroup() as tg:
        for inst in instances:
            with context.set_current_instance_id(inst.instance_id):
                thread_name = "wait-for-ssh-%s" % inst.instance_name
                tg.spawn(thread_name, self._wait_until_accessible, inst)

    LOG.info(_LI("All instances are accessible"))
def start_cluster(self, cluster):
    """Start HDFS processes and Spark, then record the cluster info.

    The namenode and datanodes are started through the ``_start_*``
    helpers, ``/user/$USER/`` is created on HDFS and chown-ed, and
    ``self.start_spark`` is invoked afterwards.
    """
    namenode = utils.get_instance(cluster, "namenode")
    datanodes = utils.get_instances(cluster, "datanode")

    # Start the name node
    self._start_namenode(namenode)

    # start the data nodes
    self._start_datanode_processes(datanodes)

    services_template = _LI(
        "Hadoop services in cluster {cluster} have been started")
    LOG.info(services_template.format(cluster=cluster.name))

    hdfs_commands = (
        "sudo -u hdfs hdfs dfs -mkdir -p /user/$USER/",
        "sudo -u hdfs hdfs dfs -chown $USER /user/$USER/",
    )
    with remote.get_remote(namenode) as nn_remote:
        for hdfs_command in hdfs_commands:
            nn_remote.execute_command(hdfs_command)

    # start spark nodes
    self.start_spark(cluster)

    started_template = _LI(
        'Cluster {cluster} has been started successfully')
    LOG.info(started_template.format(cluster=cluster.name))
    self._set_cluster_info(cluster)
def wait_for_host_registrations(self, num_hosts, ambari_info):
    """Block until at least ``num_hosts`` hosts are registered with Ambari.

    Polls ``http://<ambari address>/api/v1/hosts`` every five seconds.  Any
    exception raised while fetching, parsing or logging a poll is swallowed
    and the poll is retried.  There is currently no upper bound on the
    total wait (see the TODOs below).

    :param num_hosts: number of registered hosts to wait for
    :param ambari_info: object providing ``get_address()``; also passed
        through to ``self._get``
    """
    LOG.info(
        _LI('Waiting for all Ambari agents to register with server ...'))

    url = 'http://{0}/api/v1/hosts'.format(ambari_info.get_address())

    # Host count from the most recent poll that was fetched AND parsed
    # successfully; None until the first such poll.  The loop condition
    # only ever looks at this validated number, so a response that was
    # received but could not be parsed (or lacked 'items') leads to a
    # retry instead of a TypeError/KeyError in the condition itself.
    registered_count = None

    # TODO(jspeidel): timeout
    while registered_count is None or registered_count < num_hosts:
        context.sleep(5)
        try:
            result = self._get(url, ambari_info)
            hosts = json.loads(result.text)['items']
            registered_count = len(hosts)

            LOG.info(_LI('Registered Hosts: %(current_number)s of '
                         '%(final_number)s'),
                     {'current_number': registered_count,
                      'final_number': num_hosts})
            for host in hosts:
                LOG.debug('Registered Host: {0}'.format(
                    host['Hosts']['host_name']))
        except Exception:
            # TODO(jspeidel): max wait time
            LOG.info(_LI('Waiting to connect to ambari server ...'))
def start_cluster(self, cluster):
    """Start HDFS processes, Spark and (if present) Zeppelin.

    After the namenode and datanodes are started, ``/user/$USER/`` is
    created on HDFS and chown-ed, ``self.start_spark`` runs, Zeppelin is
    started when a "zeppelin" instance exists, and
    ``self._set_cluster_info`` is called.
    """
    namenode = utils.get_instance(cluster, "namenode")
    datanodes = utils.get_instances(cluster, "datanode")
    zeppelin = utils.get_instance(cluster, "zeppelin")

    # Start the name node
    self._start_namenode(namenode)

    # start the data nodes
    self._start_datanode_processes(datanodes)

    LOG.info(_LI("Hadoop services have been started"))

    hdfs_commands = (
        "sudo -u hdfs hdfs dfs -mkdir -p /user/$USER/",
        "sudo -u hdfs hdfs dfs -chown $USER /user/$USER/",
    )
    with remote.get_remote(namenode) as nn_remote:
        for hdfs_command in hdfs_commands:
            nn_remote.execute_command(hdfs_command)

    # start spark nodes
    self.start_spark(cluster)

    # start zeppelin, if necessary
    if zeppelin:
        self._start_zeppelin(zeppelin)

    LOG.info(_LI('Cluster has been started successfully'))
    self._set_cluster_info(cluster)
def _install_services(self, cluster_name, ambari_info):
    """PUT an "install all services" request to Ambari and wait for it.

    On a successful asynchronous install ``self._finalize_ambari_state``
    is called.  A 200 response needs no further action.

    Raises:
        ex.HadoopProvisionError: when the asynchronous request does not
            succeed, or the PUT returns a status other than 200/202.
    """
    LOG.info(_LI('Installing required Hadoop services ...'))

    url_template = ('http://{0}/api/v1/clusters/{1}/services?'
                    'ServiceInfo/state=INIT')
    install_url = url_template.format(ambari_info.get_address(),
                                      cluster_name)

    body = ('{"RequestInfo" : { "context" : "Install all services" },'
            '"Body" : {"ServiceInfo": {"state" : "INSTALLED"}}}')

    result = self._put(install_url, ambari_info, data=body)
    status = result.status_code

    if status == 202:
        request_id = json.loads(result.text)['Requests']['id']
        request_uri = self._get_async_request_uri(
            ambari_info, cluster_name, request_id)
        if not self._wait_for_async_request(request_uri, ambari_info):
            LOG.critical(_LC('Install command failed.'))
            raise ex.HadoopProvisionError(
                _('Installation of Hadoop stack failed.'))

        LOG.info(_LI("Install of Hadoop stack successful."))
        self._finalize_ambari_state(ambari_info)
    elif status != 200:
        failure_template = _LE('Install command failed. {0}')
        LOG.error(failure_template.format(result.text))
        raise ex.HadoopProvisionError(
            _('Installation of Hadoop stack failed.'))
def _await_networks(self, cluster, instances):
    """Poll until every instance has IPs, then wait for accessibility.

    Returns early when ``instances`` is empty or when
    ``g.check_cluster_exists`` reports the cluster is gone.  IP
    initialisation is retried once per second for the instances that are
    still outstanding.
    """
    if not instances:
        return

    assigned_ids = set()
    while len(assigned_ids) != len(instances):
        if not g.check_cluster_exists(cluster):
            return
        for inst in instances:
            if inst.id in assigned_ids:
                continue
            if networks.init_instances_ips(inst):
                assigned_ids.add(inst.id)

        context.sleep(1)

    LOG.info(
        _LI("Cluster '%s': all instances have IPs assigned"), cluster.id)

    cluster = conductor.cluster_get(context.ctx(), cluster)
    instances = g.get_instances(cluster, assigned_ids)

    with context.ThreadGroup() as tg:
        for inst in instances:
            thread_name = "wait-for-ssh-%s" % inst.instance_name
            tg.spawn(thread_name, self._wait_until_accessible, inst)

    LOG.info(_LI("Cluster '%s': all instances are accessible"),
             cluster.id)
def start_services(self, cluster_name, cluster_spec, ambari_info):
    """PUT a "start all services" request to Ambari and wait for it.

    A 202 response is followed up via ``self._wait_for_async_request``;
    a 200 response needs no further action.

    Raises:
        ex.HadoopProvisionError: when the asynchronous request does not
            succeed, or the PUT returns a status other than 200/202.
    """
    url_template = ('http://{0}/api/v1/clusters/{1}/services?ServiceInfo/'
                    'state=INSTALLED')
    start_url = url_template.format(ambari_info.get_address(), cluster_name)

    body = ('{"RequestInfo" : { "context" : "Start all services" },'
            '"Body" : {"ServiceInfo": {"state" : "STARTED"}}}')

    self._fire_service_start_notifications(
        cluster_name, cluster_spec, ambari_info)
    result = self._put(start_url, ambari_info, data=body)
    status = result.status_code

    if status == 202:
        request_id = json.loads(result.text)['Requests']['id']
        request_uri = self._get_async_request_uri(
            ambari_info, cluster_name, request_id)
        if not self._wait_for_async_request(request_uri, ambari_info):
            LOG.error(_LE('Failed to start Hadoop cluster.'))
            raise ex.HadoopProvisionError(
                _('Start of Hadoop services failed.'))

        LOG.info(_LI("Successfully started Hadoop cluster."))
        address_template = _LI('Ambari server address: {server_address}')
        LOG.info(address_template.format(
            server_address=ambari_info.get_address()))
    elif status != 200:
        failure_template = _LE('Start command failed. Status: {status}, '
                               'response: {response}')
        LOG.error(failure_template.format(status=status,
                                          response=result.text))
        raise ex.HadoopProvisionError(
            _('Start of Hadoop services failed.'))
def main():
    """Entry point: load config, set up logging and run the chosen command.

    The command line is logged before the command runs and the command
    name is logged once ``CONF.command.func`` has returned.
    """
    # TODO(tmckay): Work on restricting the options
    # pulled in by imports which show up in the help.
    # If we find a nice way to do this the calls to
    # unregister_extra_cli_opt() can be removed
    CONF(project="sahara")

    # For some reason, this is necessary to clear cached values
    # and re-read configs.  For instance, if this is not done
    # here the 'plugins' value will not reflect the value from
    # the config file on the command line
    CONF.reload_config_files()
    log.setup(CONF, "sahara")

    # If we have to enforce extra option checks, like one option
    # requires another, do it here
    extra_option_checks()

    # Since this may be scripted, record the command in the log
    # so a user can know exactly what was done
    start_template = _LI("Command: {command}")
    LOG.info(start_template.format(command=" ".join(sys.argv)))

    api.set_logger(LOG)
    api.set_conf(CONF)

    CONF.command.func()

    finish_template = _LI("Finished {command}")
    LOG.info(finish_template.format(command=CONF.command.name))
def start_cluster(self, cluster):
    """Start the namenode, the datanodes and (if present) the Spark master.

    After the HDFS processes are started, ``/user/$USER/`` is created on
    HDFS and chown-ed, the Spark master is started when a "master"
    instance exists, and ``self._set_cluster_info`` is called.
    """
    namenode = utils.get_instance(cluster, "namenode")
    spark_master = utils.get_instance(cluster, "master")
    datanodes = utils.get_instances(cluster, "datanode")

    # Start the name node
    with remote.get_remote(namenode) as nn_remote:
        run.format_namenode(nn_remote)
        run.start_processes(nn_remote, "namenode")

    # start the data nodes
    self._start_slave_datanode_processes(datanodes)

    LOG.info(_LI("Hadoop services in cluster %s have been started"),
             cluster.name)

    hdfs_commands = (
        "sudo -u hdfs hdfs dfs -mkdir -p /user/$USER/",
        "sudo -u hdfs hdfs dfs -chown $USER /user/$USER/",
    )
    with remote.get_remote(namenode) as nn_remote:
        for hdfs_command in hdfs_commands:
            nn_remote.execute_command(hdfs_command)

    # start spark nodes
    if spark_master:
        with remote.get_remote(spark_master) as sm_remote:
            run.start_spark_master(sm_remote, self._spark_home(cluster))
            LOG.info(_LI("Spark service at '%s' has been started"),
                     spark_master.hostname())

    LOG.info(_LI("Cluster %s has been started successfully"),
             cluster.name)
    self._set_cluster_info(cluster)
def remove_services(cluster, instances):
    """Run the three ``REMOVE_MAPR_*`` commands as root on each instance.

    ``cluster`` is accepted but not used by this function.
    """
    LOG.info(_LI("Start remove all mapr services"))
    for node in instances:
        with node.remote() as node_remote:
            for removal_cmd in (REMOVE_MAPR_PACKAGES_CMD,
                                REMOVE_MAPR_HOME_CMD,
                                REMOVE_MAPR_CORES_CMD):
                node_remote.execute_command(removal_cmd, run_as_root=True)
    LOG.info(_LI("All mapr services removed"))
def stop_services(cluster, instances):
    """Stop warden on every instance and zookeeper on zookeeper nodes.

    Both commands are executed as root.  ``cluster`` is accepted but not
    used by this function.
    """
    LOG.info(_LI("Stop warden and zookeeper"))
    for node in instances:
        with node.remote() as node_remote:
            node_remote.execute_command(STOP_WARDEN_CMD, run_as_root=True)
            if not check_if_is_zookeeper_node(node):
                continue
            node_remote.execute_command(STOP_ZOOKEEPER_CMD,
                                        run_as_root=True)
    LOG.info(_LI("Warden and zookeeper stoped"))
def exec_configure_sh_on_instance(cluster, instance, script_string):
    """Run the configure.sh command line on ``instance`` as root.

    `` -M7`` is appended when ``check_for_mapr_db(cluster)`` is true and
    `` --create-user`` when ``check_if_mapr_user_exist(instance)`` is false.
    """
    LOG.info(_LI('START: Executing configure.sh'))

    command = script_string
    if check_for_mapr_db(cluster):
        command = command + ' -M7'
    if not check_if_mapr_user_exist(instance):
        command = command + ' --create-user'

    LOG.debug('script_string = %s', command)
    node_remote = instance.remote()
    node_remote.execute_command(command, run_as_root=True)

    LOG.info(_LI('END: Executing configure.sh'))
def move_node(cluster, instances):
    """Run the server-id lookup and move commands on every instance.

    For each instance ``GET_SERVER_ID_CMD`` is formatted with the
    instance's management IP; the stripped output of that command is then
    used to format ``MOVE_NODE_CMD``.  Both run as root.  ``cluster`` is
    accepted but not used by this function.
    """
    LOG.info(_LI("Start moving the node to the /decommissioned"))
    for node in instances:
        with node.remote() as node_remote:
            lookup_cmd = GET_SERVER_ID_CMD % node.management_ip
            _exit_code, lookup_out = node_remote.execute_command(
                lookup_cmd, run_as_root=True)
            move_cmd = MOVE_NODE_CMD % lookup_out.strip()
            node_remote.execute_command(move_cmd, run_as_root=True)
    LOG.info(_LI("Nodes moved to the /decommissioned"))
def scale_cluster(cluster, instances, disk_setup_script_path, waiting_script,
                  context, configure_sh_string, is_node_awareness):
    """Install roles on the new instances, configure them, start services.

    :param cluster: cluster being scaled
    :param instances: instances being added
    :param disk_setup_script_path: forwarded to ``start_services``
    :param waiting_script: forwarded to ``start_services``
    :param context: passed to ``start_helper.install_role_on_instance``
    :param configure_sh_string: forwarded to ``start_services``
    :param is_node_awareness: accepted but not used by this function
    """
    LOG.info(_LI('START: Cluster scaling. Cluster = %s'), cluster.name)
    for inst in instances:
        start_helper.install_role_on_instance(inst, context)
    config.configure_instances(cluster, instances)
    start_services(cluster, instances, disk_setup_script_path,
                   waiting_script, configure_sh_string)
    # Log the cluster *name* here as well, so the END line matches the
    # START line instead of printing the cluster object itself.
    LOG.info(_LI('END: Cluster scaling. Cluster = %s'), cluster.name)
def format_cluster_deleted_message(cluster):
    """Return ``(message, params)`` describing a deleted cluster.

    ``params`` holds the cluster's name and id, or the translated string
    "Unknown" for both when ``cluster`` is falsy.
    """
    msg = _LI("Cluster %(name)s (id=%(id)s) was deleted. "
              "Canceling current operation.")

    if cluster:
        params = {'name': cluster.name, 'id': cluster.id}
    else:
        params = {'name': _LI("Unknown"), 'id': _LI("Unknown")}

    return (msg, params)
def install_role_on_instance(instance, cluster_context):
    """Run the role installation command on ``instance`` as root.

    The command is the install-manager string from ``cluster_context``
    followed by the roles string for the instance's node-group processes.
    It is executed with a timeout of 900.
    """
    LOG.info(_LI('START: Installing roles on node '))

    node_processes = instance.node_group.node_processes
    install_manager = cluster_context.get_install_manager()
    install_cmd = install_manager + cluster_context.get_roles_str(
        node_processes)

    LOG.debug('Executing "%(command)s" on %(instance)s',
              {'command': install_cmd, 'instance': instance.instance_id})

    node_remote = instance.remote()
    node_remote.execute_command(install_cmd, run_as_root=True, timeout=900)

    LOG.info(_LI('END: Installing roles on node '))
def install_roles(cluster, cluster_context):
    """Spawn ``install_role_on_instance`` for every instance of ``cluster``.

    The thread group is created with the number of instances as its
    argument.
    """
    LOG.info(_LI('START: Installing roles on cluster'))

    nodes = utils.get_instances(cluster)
    with context.ThreadGroup(len(nodes)) as tg:
        for node in nodes:
            thread_name = 'install_roles_%s' % node.instance_id
            tg.spawn(thread_name, install_role_on_instance,
                     node, cluster_context)

    LOG.info(_LI('END: Installing roles on cluster'))
def execute_command(self, cmd, run_as_root=False, get_stderr=False,
                    raise_when_error=True, timeout=300):
    """Run ``cmd`` through ``self.guest.sh`` and return ``(code, output)``.

    ``run_as_root``, ``get_stderr`` and ``timeout`` are accepted for
    interface compatibility but are not used by this implementation.

    :param cmd: command string handed to ``self.guest.sh``
    :param raise_when_error: when true, a ``RuntimeError`` from the guest
        is re-raised; when false it is turned into a ``(1, text)`` result
    :returns: ``(0, stdout)`` on success, ``(1, error text)`` on a
        suppressed ``RuntimeError``
    :raises RuntimeError: when the command fails and ``raise_when_error``
        is true
    """
    try:
        LOG.info(_LI("Issuing command: {cmd}").format(cmd=cmd))
        stdout = self.guest.sh(cmd)
        LOG.info(_LI("Received response: {stdout}").format(stdout=stdout))
        return 0, stdout
    except RuntimeError as err:
        if raise_when_error:
            # Bare raise re-raises the active exception with its original
            # traceback untouched.
            raise
        # Exceptions have no ``.message`` attribute on Python 3; str()
        # yields the same text for a single-argument exception on both
        # Python 2 and 3.
        return 1, str(err)
def cancel_job(job_execution_id):
    """Try to cancel a job execution, retrying until a timeout expires.

    Returns the job execution unchanged when it is already in a terminated
    status or when its cluster no longer exists.  Otherwise the status is
    set to TOBEKILLED and the engine's ``cancel_job`` is retried (with a
    3 second pause between attempts) until it reports job info, the job
    execution disappears, the status becomes terminated, or
    ``CONF.job_canceling_timeout`` seconds have elapsed.

    Falls through (implicitly returning None) when no job engine is
    available for the cluster.

    :param job_execution_id: id of the job execution to cancel
    :returns: the (possibly updated) job execution object; a falsy value
        when the job execution was deleted while waiting
    :raises e.CancelingFailed: when the timeout elapses without the job
        being canceled
    """
    ctx = context.ctx()
    job_execution = conductor.job_execution_get(ctx, job_execution_id)
    # Nothing to do for a job that has already reached a terminal status.
    if job_execution.info['status'] in edp.JOB_STATUSES_TERMINATED:
        return job_execution
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster is None:
        return job_execution
    engine = _get_job_engine(cluster, job_execution)
    if engine is not None:
        # Mark the job as "to be killed" before asking the engine.
        job_execution = conductor.job_execution_update(
            ctx, job_execution_id,
            {'info': {
                'status': edp.JOB_STATUS_TOBEKILLED
            }})

        timeout = CONF.job_canceling_timeout
        s_time = timeutils.utcnow()
        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            if job_execution.info['status'] not in edp.JOB_STATUSES_TERMINATED:
                try:
                    job_info = engine.cancel_job(job_execution)
                except Exception as ex:
                    # A failed attempt is logged and retried on the next
                    # loop iteration.
                    job_info = None
                    LOG.exception(
                        _LE("Error during cancel of job execution %(job)s: "
                            "%(error)s"), {
                                'job': job_execution.id,
                                'error': ex
                            })
                if job_info is not None:
                    job_execution = _write_job_status(job_execution, job_info)
                    LOG.info(_LI("Job execution %s was canceled successfully"),
                             job_execution.id)
                    return job_execution
                context.sleep(3)
                # Re-read the job execution; it may have been deleted or
                # have changed status in the meantime.
                job_execution = conductor.job_execution_get(
                    ctx, job_execution_id)
                if not job_execution:
                    LOG.info(
                        _LI("Job execution %(job_exec_id)s was deleted. "
                            "Canceling current operation."),
                        {'job_exec_id': job_execution_id})
                    return job_execution
            else:
                # Status became terminal while we were waiting.
                LOG.info(
                    _LI("Job execution status %(job)s: %(status)s"), {
                        'job': job_execution.id,
                        'status': job_execution.info['status']
                    })
                return job_execution
        else:
            # while/else: reached only when the timeout expired without an
            # earlier return.
            raise e.CancelingFailed(
                _('Job execution %s was not canceled') % job_execution.id)
def cancel_job(job_execution_id):
    """Try to cancel a job execution, retrying until a timeout expires.

    Returns the job execution unchanged (after logging why) when it is
    already in a terminated status or when its cluster no longer exists.
    Otherwise the status is set to TOBEKILLED and the engine's
    ``cancel_job`` is retried (with a 3 second pause between attempts)
    until it reports job info, the job execution disappears, the status
    becomes terminated, or ``CONF.job_canceling_timeout`` seconds have
    elapsed.

    Falls through (implicitly returning None) when no job engine is
    available for the cluster.

    :param job_execution_id: id of the job execution to cancel
    :returns: the (possibly updated) job execution object; a falsy value
        when the job execution was deleted while waiting
    :raises e.CancelingFailed: when the timeout elapses without the job
        being canceled
    """
    ctx = context.ctx()
    job_execution = conductor.job_execution_get(ctx, job_execution_id)
    # Nothing to do for a job that has already reached a terminal status.
    if job_execution.info['status'] in edp.JOB_STATUSES_TERMINATED:
        LOG.info(
            _LI("Job execution is already finished and shouldn't be"
                " canceled"))
        return job_execution
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster is None:
        LOG.info(_LI("Can not cancel this job on a non-existant cluster."))
        return job_execution
    engine = get_job_engine(cluster, job_execution)
    if engine is not None:
        # Mark the job as "to be killed" before asking the engine.
        job_execution = conductor.job_execution_update(
            ctx, job_execution_id,
            {'info': {
                'status': edp.JOB_STATUS_TOBEKILLED
            }})

        timeout = CONF.job_canceling_timeout
        s_time = timeutils.utcnow()
        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            if job_execution.info['status'] not in edp.JOB_STATUSES_TERMINATED:
                try:
                    job_info = engine.cancel_job(job_execution)
                except Exception as ex:
                    # A failed attempt is logged and retried on the next
                    # loop iteration.
                    job_info = None
                    LOG.warning(
                        _LW("Error during cancel of job execution: "
                            "{error}").format(error=ex))
                if job_info is not None:
                    job_execution = _write_job_status(job_execution, job_info)
                    LOG.info(_LI("Job execution was canceled successfully"))
                    return job_execution
                context.sleep(3)
                # Re-read the job execution; it may have been deleted or
                # have changed status in the meantime.
                job_execution = conductor.job_execution_get(
                    ctx, job_execution_id)
                if not job_execution:
                    LOG.info(
                        _LI("Job execution was deleted. "
                            "Canceling current operation."))
                    return job_execution
            else:
                # Status became terminal while we were waiting.
                LOG.info(
                    _LI("Job execution status: {status}").format(
                        status=job_execution.info['status']))
                return job_execution
        else:
            # while/else: reached only when the timeout expired without an
            # earlier return.
            raise e.CancelingFailed(
                _('Job execution %s was not canceled') % job_execution.id)
def _install_components(self, ambari_info, auth, cluster_name, servers):
    """Ask Ambari to move INIT host components on ``servers`` to INSTALLED.

    ``auth`` is accepted but not used by this method.
    """
    # query for the host components on the given hosts that are in the
    # INIT state
    # TODO(jspeidel): provide request context
    body = '{"HostRoles": {"state" : "INSTALLED"}}'

    uri_template = ('http://{0}/api/v1/clusters/{1}/host_components?'
                    'HostRoles/state=INIT&HostRoles/host_name.in({2})')
    install_uri = uri_template.format(ambari_info.get_address(),
                                      cluster_name,
                                      self._get_host_list(servers))

    self._exec_ambari_command(ambari_info, body, install_uri)

    LOG.info(_LI('Started Hadoop components while scaling up'))
    ip_template = _LI('Ambari server ip {ip}')
    LOG.info(ip_template.format(ip=ambari_info.get_address()))
def _install_components(self, ambari_info, auth, cluster_name, servers):
    """Ask Ambari to move INIT host components on ``servers`` to INSTALLED.

    ``auth`` is accepted but not used by this method.
    """
    # query for the host components on the given hosts that are in the
    # INIT state
    # TODO(jspeidel): provide request context
    body = '{"HostRoles": {"state" : "INSTALLED"}}'

    uri_template = ('http://{0}/api/v1/clusters/{1}/host_components?'
                    'HostRoles/state=INIT&HostRoles/host_name.in({2})')
    install_uri = uri_template.format(ambari_info.get_address(),
                                      cluster_name,
                                      self._get_host_list(servers))

    self._exec_ambari_command(ambari_info, body, install_uri)

    LOG.info(_LI('Started Hadoop components while scaling up'))
    ip_template = _LI('Ambari server ip {ip}')
    LOG.info(ip_template.format(ip=ambari_info.get_address()))
def start_cluster(self, cluster):
    """Start the Hadoop services of ``cluster`` and record its info.

    Order of operations: format and start the namenode, start secondary
    namenodes, start the jobtracker (if any), start the tasktracker /
    datanode processes and wait for the datanodes, then start Oozie and
    the Hive metastore when those instances exist.  Finally
    ``self._set_cluster_info`` is called.
    """
    nn_instance = vu.get_namenode(cluster)
    with remote.get_remote(nn_instance) as r:
        run.format_namenode(r)
        run.start_processes(r, "namenode")

    for snn in vu.get_secondarynamenodes(cluster):
        run.start_processes(remote.get_remote(snn), "secondarynamenode")

    jt_instance = vu.get_jobtracker(cluster)
    if jt_instance:
        run.start_processes(remote.get_remote(jt_instance), "jobtracker")

    self._start_tt_dn_processes(utils.get_instances(cluster))

    self._await_datanodes(cluster)

    LOG.info(_LI("Hadoop services in cluster %s have been started"),
             cluster.name)

    oozie = vu.get_oozie(cluster)
    if oozie:
        with remote.get_remote(oozie) as r:
            if c_helper.is_mysql_enable(cluster):
                run.mysql_start(r, oozie)
                run.oozie_create_db(r)
            # The share lib is addressed through the namenode host.
            run.oozie_share_lib(r, nn_instance.hostname())
            run.start_oozie(r)
            # Report the host Oozie was actually started on, not the
            # namenode's host.
            LOG.info(_LI("Oozie service at '%s' has been started"),
                     oozie.hostname())

    hive_server = vu.get_hiveserver(cluster)
    if hive_server:
        with remote.get_remote(hive_server) as r:
            run.hive_create_warehouse_dir(r)
            run.hive_copy_shared_conf(
                r, edp.get_hive_shared_conf_path('hadoop'))

            if c_helper.is_mysql_enable(cluster):
                # MySQL was already started above when Hive shares its
                # host with Oozie.
                if not oozie or hive_server.hostname() != oozie.hostname():
                    run.mysql_start(r, hive_server)
                run.hive_create_db(r, cluster.extra['hive_mysql_passwd'])
                run.hive_metastore_start(r)
                LOG.info(
                    _LI("Hive Metastore server at %s has been "
                        "started"), hive_server.hostname())

    LOG.info(_LI('Cluster %s has been started successfully'),
             cluster.name)
    self._set_cluster_info(cluster)
def terminate_unneeded_clusters(self, ctx):
    """Terminate idle transient clusters that have been active long enough.

    The ``ctx`` argument is immediately replaced by an admin context, which
    is installed as the current context for the duration of the scan and
    cleared at the end.

    A cluster is considered only when it is 'Active', transient, has no
    job executions counted with ``end_time=None``, and was last updated at
    least ``CONF.min_transient_cluster_active_time`` seconds ago.
    """
    LOG.debug('Terminating unneeded transient clusters')
    ctx = context.get_admin_context()
    context.set_ctx(ctx)
    for cluster in conductor.cluster_get_all(ctx, status='Active'):
        if not cluster.is_transient:
            continue

        # Skip clusters that still have job executions without an end time.
        jc = conductor.job_execution_count(ctx,
                                           end_time=None,
                                           cluster_id=cluster.id)

        if jc > 0:
            continue

        cluster_updated_at = timeutils.normalize_time(
            timeutils.parse_isotime(cluster.updated_at))
        current_time = timeutils.utcnow()
        spacing = timeutils.delta_seconds(cluster_updated_at, current_time)
        if spacing < CONF.min_transient_cluster_active_time:
            continue

        # NOTE(review): the source this was restored from had lost its
        # indentation; the ``else`` below is assumed to belong to this
        # ``if`` (not to the ``try``) -- confirm against history.
        if CONF.use_identity_api_v3:
            trusts.use_os_admin_auth_token(cluster)

            LOG.info(
                _LI('Terminating transient cluster %(cluster)s '
                    'with id %(id)s'), {
                        'cluster': cluster.name,
                        'id': cluster.id
                    })

            try:
                ops.terminate_cluster(cluster.id)
            except Exception as e:
                # Best effort: a failure is logged and the scan continues
                # with the next cluster.
                LOG.info(
                    _LI('Failed to terminate transient cluster '
                        '%(cluster)s with id %(id)s: %(error)s.'), {
                            'cluster': cluster.name,
                            'id': cluster.id,
                            'error': six.text_type(e)
                        })

        else:
            # Without identity API v3 the cluster is only flagged as
            # awaiting termination.
            if cluster.status != 'AwaitingTermination':
                conductor.cluster_update(
                    ctx, cluster, {'status': 'AwaitingTermination'})
    context.set_ctx(None)
def start_cluster(self, cluster):
    """Start the Hadoop services of ``cluster`` and record its info.

    Order of operations: format and start the namenode, start secondary
    namenodes, start the jobtracker (if any), start the tasktracker /
    datanode processes and wait for the datanodes, then start Oozie and
    the Hive metastore when those instances exist.  Finally
    ``self._set_cluster_info`` is called.
    """
    nn_instance = vu.get_namenode(cluster)
    with remote.get_remote(nn_instance) as r:
        run.format_namenode(r)
        run.start_processes(r, "namenode")

    for snn in vu.get_secondarynamenodes(cluster):
        run.start_processes(remote.get_remote(snn), "secondarynamenode")

    jt_instance = vu.get_jobtracker(cluster)
    if jt_instance:
        run.start_processes(remote.get_remote(jt_instance), "jobtracker")

    self._start_tt_dn_processes(utils.get_instances(cluster))

    self._await_datanodes(cluster)

    LOG.info(_LI("Hadoop services in cluster %s have been started"),
             cluster.name)

    oozie = vu.get_oozie(cluster)
    if oozie:
        with remote.get_remote(oozie) as r:
            if c_helper.is_mysql_enable(cluster):
                run.mysql_start(r, oozie)
                run.oozie_create_db(r)
            # The share lib is addressed through the namenode host.
            run.oozie_share_lib(r, nn_instance.hostname())
            run.start_oozie(r)
            # Report the host Oozie was actually started on, not the
            # namenode's host.
            LOG.info(_LI("Oozie service at '%s' has been started"),
                     oozie.hostname())

    hive_server = vu.get_hiveserver(cluster)
    if hive_server:
        with remote.get_remote(hive_server) as r:
            run.hive_create_warehouse_dir(r)
            run.hive_copy_shared_conf(
                r, edp.get_hive_shared_conf_path('hadoop'))

            if c_helper.is_mysql_enable(cluster):
                # MySQL was already started above when Hive shares its
                # host with Oozie.
                if not oozie or hive_server.hostname() != oozie.hostname():
                    run.mysql_start(r, hive_server)
                run.hive_create_db(r)
                run.hive_metastore_start(r)
                LOG.info(_LI("Hive Metastore server at %s has been "
                             "started"),
                         hive_server.hostname())

    LOG.info(_LI('Cluster %s has been started successfully'),
             cluster.name)
    self._set_cluster_info(cluster)
def decommission_nodes(cluster, instances, configure_sh_string):
    """Decommission ``instances`` from ``cluster``.

    Calls, in order: ``move_node``, ``stop_services``, a sleep of
    ``names.WAIT_NODE_ALARM_NO_HEARTBEAT``, ``remove_node`` and
    ``remove_services``.  When ``check_for_cldb_or_zookeeper_service``
    is true for the instances, configure.sh is re-executed on every
    cluster instance that is not in ``instances``.
    """
    LOG.info(_LI('Start decommission . Cluster = %s'), cluster.name)

    move_node(cluster, instances)
    stop_services(cluster, instances)
    context.sleep(names.WAIT_NODE_ALARM_NO_HEARTBEAT)
    remove_node(cluster, instances)
    remove_services(cluster, instances)

    if check_for_cldb_or_zookeeper_service(instances):
        every_instance = gen.get_instances(cluster)
        remaining = [node for node in every_instance
                     if node not in instances]
        for node in remaining:
            start_helper.exec_configure_sh_on_instance(
                cluster, node, configure_sh_string)

    LOG.info(_LI('End decommission. Cluster = %s'), cluster.name)
def setup():
    """Initialise the oslo_messaging layer.

    Always sets the module-level ``TRANSPORT``.  ``NOTIFIER`` is only
    created when ``cfg.CONF.oslo_messaging_notifications.enable`` is true.
    """
    global TRANSPORT, NOTIFIER

    messaging.set_transport_defaults('sahara')
    TRANSPORT = messaging.get_transport(cfg.CONF, aliases=_ALIASES)

    if cfg.CONF.oslo_messaging_notifications.enable:
        LOG.info(_LI("Notifications enabled"))
        payload_serializer = ContextSerializer(JsonPayloadSerializer())
        NOTIFIER = messaging.Notifier(TRANSPORT,
                                      serializer=payload_serializer)
    else:
        LOG.info(_LI("Notifications disabled"))
def create_disk_list_file(instance, path_to_disk_setup_script):
    """Upload the disk setup script to ``instance`` and run it as root.

    The script text is written to ``/tmp/disk_setup_script.sh``, made
    executable, and executed with the node group's storage paths as
    space-separated arguments.
    """
    LOG.info(_LI('START: Creating disk list file.'))

    script_path = '/tmp/disk_setup_script.sh'
    node_remote = instance.remote()

    LOG.debug('Writing /tmp/disk_setup_script.sh')
    script_text = files.get_file_text(path_to_disk_setup_script)
    node_remote.write_file_to(script_path, script_text)

    LOG.debug('Start executing command: chmod +x %s', script_path)
    chmod_cmd = 'chmod +x ' + script_path
    node_remote.execute_command(chmod_cmd, run_as_root=True)
    LOG.debug('Done for executing command.')

    storage_args = ' '.join(instance.node_group.storage_paths())
    setup_cmd = '%s %s' % (script_path, storage_args)
    LOG.debug('Executing %s', setup_cmd)
    node_remote.execute_command(setup_cmd, run_as_root=True)

    LOG.info(_LI('END: Creating disk list file.'))
def setup():
    """Initialise the oslo_messaging layer.

    Always sets the module-level ``TRANSPORT``.  ``NOTIFIER`` is only
    created when ``cfg.CONF.enable_notifications`` is true.
    """
    global TRANSPORT, NOTIFIER

    messaging.set_transport_defaults('sahara')
    TRANSPORT = messaging.get_transport(cfg.CONF, aliases=_ALIASES)

    if cfg.CONF.enable_notifications:
        LOG.info(_LI("Notifications enabled"))
        payload_serializer = ContextSerializer(JsonPayloadSerializer())
        NOTIFIER = messaging.Notifier(TRANSPORT,
                                      serializer=payload_serializer)
    else:
        LOG.info(_LI("Notifications disabled"))
def cancel_job(job_execution_id):
    """Try to cancel a job execution, retrying until a timeout expires.

    Returns the job execution unchanged when it is already in a terminated
    status or when its cluster no longer exists.  Otherwise the status is
    set to TOBEKILLED and the engine's ``cancel_job`` is retried (with a
    3 second pause between attempts) until it reports job info, the job
    execution disappears, the status becomes terminated, or
    ``CONF.job_canceling_timeout`` seconds have elapsed.

    Falls through (implicitly returning None) when no job engine is
    available for the cluster.

    :param job_execution_id: id of the job execution to cancel
    :returns: the (possibly updated) job execution object; a falsy value
        when the job execution was deleted while waiting
    :raises e.CancelingFailed: when the timeout elapses without the job
        being canceled
    """
    ctx = context.ctx()
    job_execution = conductor.job_execution_get(ctx, job_execution_id)
    # Nothing to do for a job that has already reached a terminal status.
    if job_execution.info['status'] in edp.JOB_STATUSES_TERMINATED:
        return job_execution
    cluster = conductor.cluster_get(ctx, job_execution.cluster_id)
    if cluster is None:
        return job_execution
    engine = _get_job_engine(cluster, job_execution)
    if engine is not None:
        # Mark the job as "to be killed" before asking the engine.
        job_execution = conductor.job_execution_update(
            ctx, job_execution_id,
            {'info': {'status': edp.JOB_STATUS_TOBEKILLED}})

        timeout = CONF.job_canceling_timeout
        s_time = timeutils.utcnow()
        while timeutils.delta_seconds(s_time, timeutils.utcnow()) < timeout:
            if job_execution.info['status'] not in edp.JOB_STATUSES_TERMINATED:
                try:
                    job_info = engine.cancel_job(job_execution)
                except Exception as ex:
                    # A failed attempt is logged and retried on the next
                    # loop iteration.
                    job_info = None
                    LOG.warning(
                        _LW("Error during cancel of job execution {job}: "
                            "{error}").format(job=job_execution.id,
                                              error=ex))
                if job_info is not None:
                    job_execution = _write_job_status(job_execution, job_info)
                    LOG.info(_LI("Job execution {job_id} was canceled "
                                 "successfully").format(
                                     job_id=job_execution.id))
                    return job_execution
                context.sleep(3)
                # Re-read the job execution; it may have been deleted or
                # have changed status in the meantime.
                job_execution = conductor.job_execution_get(
                    ctx, job_execution_id)
                if not job_execution:
                    LOG.info(_LI("Job execution {job_exec_id} was deleted. "
                                 "Canceling current operation.").format(
                             job_exec_id=job_execution_id))
                    return job_execution
            else:
                # Status became terminal while we were waiting.
                LOG.info(_LI("Job execution status {job}: {status}").format(
                         job=job_execution.id,
                         status=job_execution.info['status']))
                return job_execution
        else:
            # while/else: reached only when the timeout expired without an
            # earlier return.
            raise e.CancelingFailed(_('Job execution %s was not canceled')
                                    % job_execution.id)
def start_server(app):
    """Serve ``app`` with eventlet's WSGI server on the configured address.

    Listens on ``(cfg.CONF.host, cfg.CONF.port)`` with a backlog of 500 and
    wraps the socket via ``sslutils.wrap`` when ``sslutils.is_enabled()``.
    """
    listen_address = (cfg.CONF.host, cfg.CONF.port)
    listener = eventlet.listen(listen_address, backlog=500)

    if sslutils.is_enabled():
        LOG.info(_LI("Using HTTPS for port %s"), cfg.CONF.port)
        listener = sslutils.wrap(listener)

    server_log = loggers.WritableLogger(LOG)
    wsgi.server(listener, app, log=server_log, debug=False)
def post_start(self, c_context, instances=None):
    """Run every cluster service's ``post_start`` hook.

    When ``instances`` is falsy, all instances from ``c_context`` are used.
    Each service receives the subset returned by
    ``c_context.filter_instances`` for that service.
    """
    if not instances:
        instances = c_context.get_instances()

    LOG.debug('Executing service post start hooks')
    for svc in c_context.cluster_services:
        svc_instances = c_context.filter_instances(instances, service=svc)
        svc.post_start(c_context, svc_instances)
    LOG.info(_LI('Post start hooks successfully executed'))
def create_cluster(self, cluster):
    """Provision ``cluster`` through Ambari and finish its setup.

    Builds the cluster spec for the cluster's Hadoop version, wraps each
    server in ``h.HadoopServer``, provisions via
    ``self._provision_cluster``, configures topology, records the cluster
    info and, when the spec reports HDFS HA as enabled, configures it.
    """
    version = cluster.hadoop_version
    handler = self.version_factory.get_version_handler(version)

    user_inputs = self._map_to_user_inputs(version, cluster.cluster_configs)
    cluster_spec = handler.get_cluster_spec(cluster, user_inputs)
    hosts = self._get_servers(cluster)
    ambari_info = self.get_ambari_info(cluster_spec)
    self.cluster_ambari_mapping[cluster.name] = ambari_info
    rpm = self._get_rpm_uri(cluster_spec)

    servers = [
        h.HadoopServer(
            host, cluster_spec.node_groups[utils.get_host_role(host)],
            ambari_rpm=rpm)
        for host in hosts
    ]

    self._provision_cluster(cluster.name, cluster_spec, ambari_info,
                            servers, cluster.hadoop_version)

    # add the topology data file and script if rack awareness is
    # enabled
    self._configure_topology_for_cluster(cluster, servers)

    LOG.info(_LI("Install of Hadoop stack successful."))
    # add service urls
    self._set_cluster_info(cluster, cluster_spec)

    # check if HDFS HA is enabled; set it up if so
    if cluster_spec.is_hdfs_ha_enabled(cluster):
        self.configure_hdfs_ha(cluster)
def change_cluster_status(cluster, status, status_description=None):
    """Set a new status on ``cluster`` and send an update notification.

    The cluster is re-read first.  When ``status_description`` is not None
    it is forwarded to ``change_cluster_status_description``.  Nothing
    else happens when the cluster is gone or already in the deleting
    status; in that case the re-read value is returned as is.

    :returns: the updated cluster, or the re-read cluster/None when no
        update was made
    """
    ctx = context.ctx()

    # Update cluster status. Race conditions with deletion are still possible,
    # but this reduces probability at least.
    if cluster:
        cluster = conductor.cluster_get(ctx, cluster)
    else:
        cluster = None

    if status_description is not None:
        change_cluster_status_description(cluster, status_description)

    # 'Deleting' is final and can't be changed
    if cluster is None or cluster.status == CLUSTER_STATUS_DELETING:
        return cluster

    cluster = conductor.cluster_update(ctx, cluster, {"status": status})
    conductor.cluster_provision_progress_update(ctx, cluster.id)

    status_template = _LI("Cluster status has been changed. New status="
                          "{status}")
    LOG.info(status_template.format(status=cluster.status))

    sender.notify(ctx, cluster.id, cluster.name, cluster.status, "update")

    return cluster
def _exec_ambari_command(self, ambari_info, body, cmd_uri):
    """PUT ``body`` to ``cmd_uri`` and wait for the resulting request.

    Raises:
        ex.HadoopProvisionError: when the PUT does not return 202, or the
            asynchronous request it started does not succeed.
    """
    LOG.debug('PUT URI: {0}'.format(cmd_uri))
    result = self._put(cmd_uri, ambari_info, data=body)

    if result.status_code != 202:
        LOG.error(
            _LE('Command failed. Status: %(status)s, response: '
                '%(response)s'),
            {'status': result.status_code, 'response': result.text})
        raise ex.HadoopProvisionError(_('Hadoop/Ambari command failed.'))

    LOG.debug('PUT response: {0}'.format(result.text))
    response = json.loads(result.text)
    tasks_href = response['href'] + '/tasks?fields=Tasks/status'

    if not self._wait_for_async_request(tasks_href, ambari_info):
        LOG.critical(_LC('Failed to change state of Hadoop '
                         'components'))
        raise ex.HadoopProvisionError(
            _('Failed to change state of Hadoop components'))

    LOG.info(_LI("Successfully changed state of Hadoop components "))
def _single_run(self, application, sock):
    """Log the start-up and run an eventlet WSGI server on ``sock``.

    The server uses ``self.pool`` as its custom pool and a writable
    wrapper around ``LOG`` as its log.
    """
    LOG.info(_LI("Starting single process server"))
    server_log = loggers.WritableLogger(LOG)
    eventlet.wsgi.server(sock,
                         application,
                         custom_pool=self.pool,
                         log=server_log,
                         debug=False)
def _provision_cluster(self, name, cluster_spec, ambari_info,
                       servers, version):
    """Provision Ambari on every server, then provision the cluster.

    Each server's ``provision_ambari`` is spawned in a thread group; after
    that the Ambari client waits for host registrations, credentials are
    set, and the client's ``provision_cluster`` is called.
    """
    # TODO(jspeidel): encapsulate in another class

    if servers:
        cpo.add_provisioning_step(
            servers[0].cluster_id,
            _("Provision cluster via Ambari"), len(servers))

    with context.ThreadGroup() as tg:
        for srv in servers:
            instance_id = srv.instance['instance_id']
            with context.set_current_instance_id(instance_id):
                thread_name = ("hdp-provision-instance-%s" %
                               srv.instance.hostname())
                tg.spawn(thread_name, srv.provision_ambari,
                         ambari_info, cluster_spec)

    handler = self.version_factory.get_version_handler(version)
    ambari_client = handler.get_ambari_client()

    ambari_client.wait_for_host_registrations(len(servers), ambari_info)
    self._set_ambari_credentials(cluster_spec, ambari_info, version)

    ambari_client.provision_cluster(
        cluster_spec, servers, ambari_info, name)

    done_template = _LI(
        'Cluster provisioned via Ambari Server: {server_ip}')
    LOG.info(done_template.format(server_ip=ambari_info.get_address()))
def create_cluster(self, cluster):
    """Provision ``cluster`` through Ambari and record its info.

    Builds the cluster spec for the cluster's Hadoop version, wraps each
    server in ``h.HadoopServer``, provisions via
    ``self._provision_cluster``, configures topology and finally stores
    the cluster info.
    """
    version = cluster.hadoop_version
    handler = self.version_factory.get_version_handler(version)

    user_inputs = self._map_to_user_inputs(version, cluster.cluster_configs)
    cluster_spec = handler.get_cluster_spec(cluster, user_inputs)
    hosts = self._get_servers(cluster)
    ambari_info = self.get_ambari_info(cluster_spec)
    self.cluster_ambari_mapping[cluster.name] = ambari_info
    rpm = self._get_rpm_uri(cluster_spec)

    servers = [
        h.HadoopServer(
            host, cluster_spec.node_groups[utils.get_host_role(host)],
            ambari_rpm=rpm)
        for host in hosts
    ]

    self._provision_cluster(cluster.name, cluster_spec, ambari_info,
                            servers, cluster.hadoop_version)

    # add the topology data file and script if rack awareness is
    # enabled
    self._configure_topology_for_cluster(cluster, servers)

    LOG.info(_LI("Install of Hadoop stack successful."))
    # add service urls
    self._set_cluster_info(cluster, cluster_spec)
def configure_hdfs_ha(self, cluster):
    """Set up HDFS HA for ``cluster`` through the Ambari client.

    Rebuilds the cluster spec and the ``h.HadoopServer`` list for the
    cluster, refreshes ``self.cluster_ambari_mapping`` and delegates to
    the Ambari client's ``setup_hdfs_ha``.
    """
    LOG.debug("Configuring HDFS HA")

    version = cluster.hadoop_version
    handler = self.version_factory.get_version_handler(version)

    user_inputs = self._map_to_user_inputs(version, cluster.cluster_configs)
    cluster_spec = handler.get_cluster_spec(cluster, user_inputs)
    hosts = self._get_servers(cluster)
    ambari_info = self.get_ambari_info(cluster_spec)
    self.cluster_ambari_mapping[cluster.name] = ambari_info
    rpm = self._get_rpm_uri(cluster_spec)

    servers = [
        h.HadoopServer(
            host, cluster_spec.node_groups[utils.get_host_role(host)],
            ambari_rpm=rpm)
        for host in hosts
    ]

    ambari_client = handler.get_ambari_client()
    ambari_client.setup_hdfs_ha(cluster_spec, servers, ambari_info,
                                cluster.name)

    LOG.info(_LI("Configure HDFS HA successful."))
def vm_awareness_mapred_config():
    """Return the mapred template entries that have a truthy ``value``.

    The entries are loaded from ``topology/resources/mapred-template.xml``
    and the selected ones are logged before being returned.
    """
    defaults = x.load_hadoop_xml_defaults(
        'topology/resources/mapred-template.xml')
    selected = [entry for entry in defaults if entry['value']]

    LOG.info(
        _LI("Vm awareness will add following configs in map-red "
            "params: %s"), selected)
    return selected
def change_cluster_status(cluster, status, status_description=None):
    """Set a new status on ``cluster`` and send an update notification.

    The cluster is re-read first.  Nothing is changed when the cluster is
    gone or already in the 'Deleting' status; in that case the re-read
    value is returned as is.  A truthy ``status_description`` is stored
    together with the status.

    :returns: the updated cluster, or the re-read cluster/None when no
        update was made
    """
    ctx = context.ctx()

    # Update cluster status. Race conditions with deletion are still possible,
    # but this reduces probability at least.
    if cluster:
        cluster = conductor.cluster_get(ctx, cluster)
    else:
        cluster = None

    # 'Deleting' is final and can't be changed
    if cluster is None or cluster.status == 'Deleting':
        return cluster

    changes = {"status": status}
    if status_description:
        changes["status_description"] = status_description

    cluster = conductor.cluster_update(ctx, cluster, changes)

    log_params = {'id': cluster.id, 'status': cluster.status}
    LOG.info(
        _LI("Cluster status has been changed: id=%(id)s, New status="
            "%(status)s"), log_params)

    sender.notify(ctx, cluster.id, cluster.name, cluster.status, "update")

    return cluster
def _rollback_cluster_creation(self, cluster, ex):
    """Log the rollback reason and shut the cluster down.

    ``ex`` is only used as the reason in the log message.
    """
    rollback_details = {'name': cluster.name, 'reason': ex}
    LOG.info(_LI("Cluster '%(name)s' creation rollback "
                 "(reason: %(reason)s)"),
             rollback_details)
    self.shutdown_cluster(cluster)