def _install_services(self, cluster_name, ambari_info):
    ambari_address = ambari_info.get_address()
    install_url = ('http://{0}/api/v1/clusters/{1}/services?'
                   'ServiceInfo/state=INIT'.format(
                       ambari_address, cluster_name))
    body = ('{"RequestInfo" : { "context" : "Install all services" },'
            '"Body" : {"ServiceInfo": {"state" : "INSTALLED"}}}')

    result = self._put(install_url, ambari_info, data=body)
    if result.status_code == 202:
        json_result = json.loads(result.text)
        request_id = json_result['Requests']['id']
        success = self._wait_for_async_request(
            self._get_async_request_uri(ambari_info, cluster_name,
                                        request_id),
            ambari_info)
        if success:
            LOG.info(_LI("Hadoop stack installed successfully."))
            self._finalize_ambari_state(ambari_info)
        else:
            LOG.error(_LE('Install command failed.'))
            raise ex.HadoopProvisionError(
                _('Installation of Hadoop stack failed.'))
    elif result.status_code != 200:
        LOG.error(
            _LE('Install command failed. {result}').format(
                result=result.text))
        raise ex.HadoopProvisionError(
            _('Installation of Hadoop stack failed.'))
def start_services(self, cluster_name, cluster_spec, ambari_info):
    start_url = ('http://{0}/api/v1/clusters/{1}/services?ServiceInfo/'
                 'state=INSTALLED'.format(ambari_info.get_address(),
                                          cluster_name))
    body = ('{"RequestInfo" : { "context" : "Start all services" },'
            '"Body" : {"ServiceInfo": {"state" : "STARTED"}}}')

    self._fire_service_start_notifications(
        cluster_name, cluster_spec, ambari_info)
    result = self._put(start_url, ambari_info, data=body)
    if result.status_code == 202:
        json_result = json.loads(result.text)
        request_id = json_result['Requests']['id']
        success = self._wait_for_async_request(
            self._get_async_request_uri(ambari_info, cluster_name,
                                        request_id),
            ambari_info)
        if success:
            LOG.info(_LI("Successfully started Hadoop cluster."))
            LOG.info(_LI('Ambari server address: {server_address}').format(
                server_address=ambari_info.get_address()))
        else:
            LOG.error(_LE('Failed to start Hadoop cluster.'))
            raise ex.HadoopProvisionError(
                _('Start of Hadoop services failed.'))
    elif result.status_code != 200:
        LOG.error(
            _LE('Start command failed. Status: {status}, '
                'response: {response}').format(status=result.status_code,
                                               response=result.text))
        raise ex.HadoopProvisionError(
            _('Start of Hadoop services failed.'))
def _provision_cluster(cluster_id):
    ctx, cluster, plugin = _prepare_provisioning(cluster_id)

    if CONF.use_identity_api_v3 and cluster.is_transient:
        trusts.create_trust_for_cluster(cluster)

    # updating cluster infra
    cluster = g.change_cluster_status(cluster, "InfraUpdating")
    plugin.update_infra(cluster)

    # creating instances and configuring them
    cluster = conductor.cluster_get(ctx, cluster_id)
    INFRA.create_cluster(cluster)

    if not g.check_cluster_exists(cluster):
        LOG.info(g.format_cluster_deleted_message(cluster))
        return

    # configure cluster
    cluster = g.change_cluster_status(cluster, "Configuring")
    try:
        plugin.configure_cluster(cluster)
    except Exception as ex:
        if not g.check_cluster_exists(cluster):
            LOG.info(g.format_cluster_deleted_message(cluster))
            return
        LOG.exception(
            _LE("Can't configure cluster '%(name)s' (reason: %(reason)s)"),
            {'name': cluster.name, 'reason': ex})
        g.change_cluster_status(cluster, "Error")
        return

    if not g.check_cluster_exists(cluster):
        LOG.info(g.format_cluster_deleted_message(cluster))
        return

    # starting prepared and configured cluster
    cluster = g.change_cluster_status(cluster, "Starting")
    try:
        plugin.start_cluster(cluster)
    except Exception as ex:
        if not g.check_cluster_exists(cluster):
            LOG.info(g.format_cluster_deleted_message(cluster))
            return
        LOG.exception(
            _LE("Can't start services for cluster '%(name)s' (reason: "
                "%(reason)s)"), {'name': cluster.name, 'reason': ex})
        g.change_cluster_status(cluster, "Error")
        return

    if not g.check_cluster_exists(cluster):
        LOG.info(g.format_cluster_deleted_message(cluster))
        return

    # cluster is now up and ready
    cluster = g.change_cluster_status(cluster, "Active")

    # schedule execution pending job for cluster
    for je in conductor.job_execution_get_all(ctx, cluster_id=cluster.id):
        job_manager.run_job(je.id)
def _add_hosts_and_components(self, cluster_spec, servers, ambari_info,
                              name):
    add_host_url = 'http://{0}/api/v1/clusters/{1}/hosts/{2}'
    add_host_component_url = ('http://{0}/api/v1/clusters/{1}'
                              '/hosts/{2}/host_components/{3}')
    for host in servers:
        hostname = host.instance.fqdn().lower()
        result = self._post(
            add_host_url.format(ambari_info.get_address(), name, hostname),
            ambari_info)
        if result.status_code != 201:
            LOG.error(
                _LE('Create host command failed. {0}').format(result.text))
            raise ex.HadoopProvisionError(
                _('Failed to add host: %s') % result.text)

        node_group_name = host.node_group.name
        # TODO(jspeidel): ensure that node group exists
        node_group = cluster_spec.node_groups[node_group_name]
        for component in node_group.components:
            # don't add any AMBARI components
            if component.find('AMBARI') != 0:
                result = self._post(add_host_component_url.format(
                    ambari_info.get_address(), name, hostname, component),
                    ambari_info)
                if result.status_code != 201:
                    LOG.error(
                        _LE('Create host_component command failed. %s'),
                        result.text)
                    raise ex.HadoopProvisionError(
                        _('Failed to add host component: %s')
                        % result.text)
def wrapper(cluster_id, *args, **kwds):
    ctx = context.ctx()
    try:
        # Clearing status description before executing
        c_u.change_cluster_status_description(cluster_id, "")
        f(cluster_id, *args, **kwds)
    except Exception as ex:
        # something happened during cluster operation
        cluster = conductor.cluster_get(ctx, cluster_id)
        # check if cluster still exists (it might have been removed)
        if (cluster is None or
                cluster.status == c_u.CLUSTER_STATUS_DELETING):
            LOG.debug("Cluster was deleted or marked for deletion. "
                      "Canceling current operation.")
            return

        msg = six.text_type(ex)
        LOG.exception(
            _LE("Error during operating on cluster (reason: "
                "{reason})").format(reason=msg))
        try:
            # trying to rollback
            desc = description.format(reason=msg)
            if _rollback_cluster(cluster, ex):
                c_u.change_cluster_status(
                    cluster, c_u.CLUSTER_STATUS_ACTIVE, desc)
            else:
                c_u.change_cluster_status(
                    cluster, c_u.CLUSTER_STATUS_ERROR, desc)
        except Exception as rex:
            cluster = conductor.cluster_get(ctx, cluster_id)
            # check if cluster still exists (it might have been
            # removed during rollback)
            if (cluster is None or
                    cluster.status == c_u.CLUSTER_STATUS_DELETING):
                LOG.debug("Cluster was deleted or marked for deletion."
                          " Canceling current operation.")
                return

            LOG.exception(
                _LE("Error during rollback of cluster (reason:"
                    " {reason})").format(reason=six.text_type(rex)))
            desc = "{0}, {1}".format(msg, six.text_type(rex))
            c_u.change_cluster_status(
                cluster, c_u.CLUSTER_STATUS_ERROR,
                description.format(reason=desc))
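# Illustrative sketch only (not from the original source): `f`, `description`
# and `_rollback_cluster` above are free variables, which suggests this
# wrapper is produced by a decorator factory roughly shaped like the
# following. The factory name and structure below are assumptions.
#
# def ops_error_handler(description):
#     def decorator(f):
#         @functools.wraps(f)
#         def wrapper(cluster_id, *args, **kwds):
#             ...  # body as shown above
#         return wrapper
#     return decorator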
def _delete_job_execution(job_execution_id):
    try:
        _cancel_job_execution(job_execution_id)
    except exceptions.CancelingFailed:
        LOG.error(_LE("Job execution %s can't be cancelled in time. "
                      "Deleting it anyway."), job_execution_id)
    conductor.job_execution_destroy(context.ctx(), job_execution_id)
def _migrate_up(self, engine, version, with_data=False):
    """migrate up to a new version of the db.

    We allow for data insertion and post checks at every
    migration version with special _pre_upgrade_### and
    _check_### functions in the main test.
    """
    # NOTE(sdague): try block is here because it's impossible to debug
    # where a failed data migration happens otherwise
    check_version = version
    try:
        if with_data:
            data = None
            pre_upgrade = getattr(self, "_pre_upgrade_%s" % check_version,
                                  None)
            if pre_upgrade:
                data = pre_upgrade(engine)

        self._migrate(engine, version, 'upgrade')
        self.assertEqual(version, self._get_version_from_db(engine))

        if with_data:
            check = getattr(self, "_check_%s" % check_version, None)
            if check:
                check(engine, data)
    except Exception:
        LOG.error(
            _LE("Failed to migrate to version {version} on engine "
                "{engine}").format(version=version, engine=engine))
        raise
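# Illustrative sketch only: the docstring above refers to optional
# per-version hooks on the test class. Assuming a migration numbered 007,
# they would look roughly like this (names and bodies are hypothetical):
#
# def _pre_upgrade_007(self, engine):
#     # seed rows before upgrading; the return value is later passed to
#     # _check_007 as `data`
#     return {'seed': 'rows'}
#
# def _check_007(self, engine, data):
#     # verify the seeded rows survived the schema upgrade
#     assert data is not None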
def create_trust(trustor, trustee, role_names, impersonation=True,
                 project_id=None, expires=True):
    """Create a trust and return its identifier

    :param trustor: The Keystone client delegating the trust.
    :param trustee: The Keystone client consuming the trust.
    :param role_names: A list of role names to be assigned.
    :param impersonation: Should the trustee impersonate trustor,
                          default is True.
    :param project_id: The project that the trust will be scoped into,
                       default is the trustor's project id.
    :param expires: The trust will expire if this is set to True.
    :returns: A valid trust id.
    :raises CreationFailed: If the trust cannot be created.
    """
    if project_id is None:
        project_id = trustor.tenant_id
    try:
        expires_at = _get_expiry() if expires else None
        trust = trustor.trusts.create(trustor_user=trustor.user_id,
                                      trustee_user=trustee.user_id,
                                      impersonation=impersonation,
                                      role_names=role_names,
                                      project=project_id,
                                      expires_at=expires_at)
        LOG.debug("Created trust {trust_id}".format(
            trust_id=six.text_type(trust.id)))
        return trust.id
    except Exception as e:
        LOG.error(_LE("Unable to create trust (reason: {reason})").format(
            reason=e))
        raise ex.CreationFailed(_("Failed to create trust"))
def _delete_volume(volume_id):
    LOG.debug("Deleting volume {volume}".format(volume=volume_id))
    volume = cinder.get_volume(volume_id)
    try:
        b.execute_with_retries(volume.delete)
    except Exception:
        LOG.error(_LE("Can't delete volume {volume}").format(
            volume=volume.id))
def _delete_volume(volume_id):
    LOG.debug("Deleting volume %s" % volume_id)
    volume = cinder.get_volume(volume_id)
    try:
        volume.delete()
    except Exception:
        LOG.exception(_LE("Can't delete volume %s"), volume.id)
def _detach_volume(instance, volume_id):
    volume = cinder.get_volume(volume_id)
    try:
        LOG.debug("Detaching volume %s from instance %s" % (
            volume_id, instance.instance_name))
        nova.client().volumes.delete_server_volume(instance.instance_id,
                                                   volume_id)
    except Exception:
        LOG.exception(_LE("Can't detach volume %s"), volume.id)

    detach_timeout = CONF.detach_volume_timeout
    LOG.debug("Waiting %d seconds to detach %s volume" % (detach_timeout,
                                                          volume_id))
    s_time = tu.utcnow()
    while tu.delta_seconds(s_time, tu.utcnow()) < detach_timeout:
        volume = cinder.get_volume(volume_id)
        if volume.status not in ['available', 'error']:
            context.sleep(2)
        else:
            LOG.debug("Volume %s has been detached" % volume_id)
            return
    else:
        LOG.warn(_LW("Can't detach volume %(volume)s. "
                     "Current status of volume: %(status)s"),
                 {'volume': volume_id, 'status': volume.status})
def execute_job(job_id, data):
    # Elements common to all job types
    cluster_id = data['cluster_id']
    configs = data.get('job_configs', {})

    # Not in Java job types but present for all others
    input_id = data.get('input_id', None)
    output_id = data.get('output_id', None)

    # Since we will use a unified class in the database, we pass
    # a superset for all job types
    job_ex_dict = {'input_id': input_id, 'output_id': output_id,
                   'job_id': job_id, 'cluster_id': cluster_id,
                   'info': {'status': edp.JOB_STATUS_PENDING},
                   'job_configs': configs, 'extra': {}}
    job_execution = conductor.job_execution_create(context.ctx(),
                                                   job_ex_dict)

    # check to use proxy user
    if p.job_execution_requires_proxy_user(job_execution):
        try:
            p.create_proxy_user_for_job_execution(job_execution)
        except ex.SaharaException as e:
            LOG.exception(_LE("Can't run job execution '{0}' "
                              "(reasons: {1})").format(job_execution.id, e))
            conductor.job_execution_destroy(context.ctx(), job_execution)
            raise e

    OPS.run_edp_job(job_execution.id)

    return job_execution
def _delete_volume(volume_id):
    LOG.debug("Deleting volume {volume}".format(volume=volume_id))
    volume = cinder.get_volume(volume_id)
    try:
        volume.delete()
    except Exception:
        LOG.error(_LE("Can't delete volume {volume}").format(
            volume=volume.id))
def update_job_status(job_execution_id):
    try:
        get_job_status(job_execution_id)
    except Exception as e:
        LOG.exception(
            _LE("Error during update job execution {job}: {error}").format(
                job=job_execution_id, error=e))
def _build_proxy_command(self, command, instance=None, port=None,
                         info=None, rootwrap_command=None):
    # Accepted keywords in the proxy command template:
    # {host}, {port}, {tenant_id}, {network_id}, {router_id}
    keywords = {}

    if not info:
        info = self.get_neutron_info(instance)
    keywords['tenant_id'] = context.current().tenant_id
    keywords['network_id'] = info['network']

    # Query Neutron only if needed
    if '{router_id}' in command:
        client = neutron.NeutronClient(info['network'], info['token'],
                                       info['tenant'])
        keywords['router_id'] = client.get_router()

    keywords['host'] = instance.management_ip
    keywords['port'] = port

    try:
        command = command.format(**keywords)
    except KeyError as e:
        LOG.error(_LE('Invalid keyword in proxy_command: {result}').format(
            result=e))
        # Do not give more details to the end-user
        raise ex.SystemError('Misconfiguration')
    if rootwrap_command:
        command = '{0} {1}'.format(rootwrap_command, command)
    return command
def _hdfs_ha_update_host_component(self, hac, host, component, state):
    update_host_component_url = ('http://{0}/api/v1/clusters/{1}'
                                 '/hosts/{2}/host_components/{3}').format(
        hac['ambari_info'].get_address(), hac['name'], host, component)
    component_state = {"HostRoles": {"state": state}}
    body = json.dumps(component_state)

    result = self._put(update_host_component_url, hac['ambari_info'],
                       data=body)

    if result.status_code == 202:
        json_result = json.loads(result.text)
        request_id = json_result['Requests']['id']
        success = self._wait_for_async_request(
            self._get_async_request_uri(hac['ambari_info'], hac['name'],
                                        request_id),
            hac['ambari_info'])
        if success:
            LOG.info(_LI("HDFS-HA: Host component updated successfully: "
                         "{0} {1}").format(host, component))
        else:
            LOG.critical(_LC("HDFS-HA: Host component update failed: "
                             "{0} {1}").format(host, component))
            raise ex.NameNodeHAConfigurationError(
                'Configuring HDFS HA failed. %s' % result.text)
    elif result.status_code != 200:
        LOG.error(
            _LE('Configuring HDFS HA failed. {0}').format(result.text))
        raise ex.NameNodeHAConfigurationError(
            'Configuring HDFS HA failed. %s' % result.text)
def execute_job(job_id, data):
    # Elements common to all job types
    cluster_id = data['cluster_id']
    configs = data.get('job_configs', {})
    interface = data.get('interface', {})

    # Not in Java job types but present for all others
    input_id = data.get('input_id', None)
    output_id = data.get('output_id', None)

    # Since we will use a unified class in the database, we pass
    # a superset for all job types
    job_ex_dict = {'input_id': input_id, 'output_id': output_id,
                   'job_id': job_id, 'cluster_id': cluster_id,
                   'info': {'status': edp.JOB_STATUS_PENDING},
                   'job_configs': configs, 'extra': {},
                   'interface': interface}
    job_execution = conductor.job_execution_create(context.ctx(),
                                                   job_ex_dict)
    context.set_current_job_execution_id(job_execution.id)

    # check to use proxy user
    if p.job_execution_requires_proxy_user(job_execution):
        try:
            p.create_proxy_user_for_job_execution(job_execution)
        except ex.SaharaException as e:
            LOG.error(_LE("Can't run job execution. "
                          "(Reasons: {reason})").format(reason=e))
            conductor.job_execution_destroy(context.ctx(), job_execution)
            raise e

    OPS.run_edp_job(job_execution.id)

    return job_execution
def invoke(self, method, relpath=None, params=None, data=None,
           headers=None):
    """Invoke an API method

    :return: Raw body or JSON dictionary (if response content type is
             JSON).
    """
    path = self._join_uri(relpath)
    resp = self._client.execute(method, path, params=params, data=data,
                                headers=headers)
    try:
        body = resp.read()
    except Exception as e:
        # Note: the caught exception must not shadow the `ex` exceptions
        # module used to raise CMApiException.
        raise ex.CMApiException(
            _("Command %(method)s %(path)s failed: %(msg)s")
            % {'method': method, 'path': path, 'msg': six.text_type(e)})

    LOG.debug("{method} got response: {body}".format(method=method,
                                                     body=body[:32]))
    # Is the response application/json?
    if (len(body) != 0 and resp.info().getmaintype() == "application"
            and resp.info().getsubtype() == "json"):
        try:
            json_dict = json.loads(body)
            return json_dict
        except Exception:
            LOG.error(_LE('JSON decode error: {body}').format(body=body))
            raise
    else:
        return body
def _build_proxy_command(self, command, instance=None, port=None,
                         info=None, rootwrap_command=None):
    # Accepted keywords in the proxy command template:
    # {host}, {port}, {tenant_id}, {network_id}, {router_id}
    keywords = {}

    if not info:
        info = self.get_neutron_info(instance)
    keywords['tenant_id'] = context.current().tenant_id
    keywords['network_id'] = info['network']

    # Query Neutron only if needed
    if '{router_id}' in command:
        client = neutron.NeutronClient(info['network'], info['uri'],
                                       info['token'], info['tenant'])
        keywords['router_id'] = client.get_router()

    keywords['host'] = instance.management_ip
    keywords['port'] = port

    try:
        command = command.format(**keywords)
    except KeyError as e:
        LOG.error(
            _LE('Invalid keyword in proxy_command: {result}').format(
                result=e))
        # Do not give more details to the end-user
        raise ex.SystemError('Misconfiguration')
    if rootwrap_command:
        command = '{0} {1}'.format(rootwrap_command, command)
    return command
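# Illustrative sketch only: a proxy_command template is expected to use the
# keywords listed in the comment above ({host}, {port}, {tenant_id},
# {network_id}, {router_id}). The concrete commands below are assumptions
# for illustration, not values taken from the original source.
#
# proxy_command = 'ip netns exec qrouter-{router_id} nc {host} {port}'
# proxy_command = 'ssh relay-{tenant_id} nc {host} {port}'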
def create_trust(trustor, trustee, role_names, impersonation=True,
                 project_id=None):
    '''Create a trust and return its identifier

    :param trustor: The Keystone client delegating the trust.
    :param trustee: The Keystone client consuming the trust.
    :param role_names: A list of role names to be assigned.
    :param impersonation: Should the trustee impersonate trustor,
                          default is True.
    :param project_id: The project that the trust will be scoped into,
                       default is the trustor's project id.
    :returns: A valid trust id.
    :raises CreationFailed: If the trust cannot be created.
    '''
    if project_id is None:
        project_id = trustor.tenant_id
    try:
        trust = trustor.trusts.create(trustor_user=trustor.user_id,
                                      trustee_user=trustee.user_id,
                                      impersonation=impersonation,
                                      role_names=role_names,
                                      project=project_id)
        LOG.debug('Created trust {0}'.format(six.text_type(trust.id)))
        return trust.id
    except Exception as e:
        LOG.exception(_LE('Unable to create trust (reason: %s)'), e)
        raise ex.CreationFailed(_('Failed to create trust'))
def _exec_ambari_command(self, ambari_info, body, cmd_uri):
    LOG.debug('PUT URI: {0}'.format(cmd_uri))
    result = self._put(cmd_uri, ambari_info, data=body)
    if result.status_code == 202:
        LOG.debug('PUT response: {0}'.format(result.text))
        json_result = json.loads(result.text)
        href = json_result['href'] + '/tasks?fields=Tasks/status'
        success = self._wait_for_async_request(href, ambari_info)
        if success:
            LOG.info(
                _LI("Successfully changed state of Hadoop components "))
        else:
            LOG.critical(_LC('Failed to change state of Hadoop '
                             'components'))
            raise ex.HadoopProvisionError(
                _('Failed to change state of Hadoop components'))
    else:
        LOG.error(
            _LE('Command failed. Status: %(status)s, response: '
                '%(response)s'),
            {'status': result.status_code, 'response': result.text})
        raise ex.HadoopProvisionError(_('Hadoop/Ambari command failed.'))
def execute_with_retries(method, *args, **kwargs):
    attempts = CONF.retries.retries_number + 1
    while attempts > 0:
        try:
            return method(*args, **kwargs)
        except Exception as e:
            error_code = getattr(e, 'http_status', None) or getattr(
                e, 'status_code', None) or getattr(e, 'code', None)
            if error_code in ERRORS_TO_RETRY:
                LOG.warning(_LW('Occasional error occurred during '
                                '"{method}" execution: {error_msg} '
                                '({error_code}). Operation will be '
                                'retried.').format(
                    method=method.__name__,
                    error_msg=e,
                    error_code=error_code))
                attempts -= 1
                retry_after = getattr(e, 'retry_after', 0)
                context.sleep(max(retry_after, CONF.retries.retry_after))
            else:
                LOG.error(_LE('Permanent error occurred during "{method}" '
                              'execution: {error_msg}.').format(
                    method=method.__name__,
                    error_msg=e))
                raise e
    else:
        raise ex.MaxRetriesExceeded(attempts, method.__name__)
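# Illustrative sketch only: execute_with_retries wraps a single client call
# so that transient errors whose codes appear in ERRORS_TO_RETRY are retried
# after a delay, while any other error is re-raised immediately and
# MaxRetriesExceeded is raised once the attempts are exhausted. Usage,
# assuming the cinder helper from the snippets above:
#
# volume = cinder.get_volume(volume_id)
# execute_with_retries(volume.delete)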
def wrapper(cluster_id, *args, **kwds):
    ctx = context.ctx()
    try:
        # Clearing status description before executing
        g.change_cluster_status_description(cluster_id, "")
        f(cluster_id, *args, **kwds)
    except Exception as ex:
        # something happened during cluster operation
        cluster = conductor.cluster_get(ctx, cluster_id)
        # check if cluster still exists (it might have been removed)
        if cluster is None or cluster.status == 'Deleting':
            LOG.debug("Cluster id={id} was deleted or marked for "
                      "deletion. Canceling current operation.".format(
                          id=cluster_id))
            return

        msg = six.text_type(ex)
        LOG.error(
            _LE("Error during operating on cluster {name} (reason: "
                "{reason})").format(name=cluster.name, reason=msg))
        try:
            # trying to rollback
            desc = description.format(reason=msg)
            if _rollback_cluster(cluster, ex):
                g.change_cluster_status(cluster, "Active", desc)
            else:
                g.change_cluster_status(cluster, "Error", desc)
        except Exception as rex:
            cluster = conductor.cluster_get(ctx, cluster_id)
            # check if cluster still exists (it might have been
            # removed during rollback)
            if cluster is None or cluster.status == 'Deleting':
                LOG.debug("Cluster id={id} was deleted or marked for "
                          "deletion. Canceling current operation."
                          .format(id=cluster_id))
                return

            LOG.error(
                _LE("Error during rollback of cluster {name} (reason:"
                    " {reason})").format(name=cluster.name,
                                         reason=six.text_type(rex)))
            desc = "{0}, {1}".format(msg, six.text_type(rex))
            g.change_cluster_status(
                cluster, "Error", description.format(reason=desc))
def setup_db():
    try:
        engine = get_engine()
        m.Cluster.metadata.create_all(engine)
    except sa.exc.OperationalError as e:
        LOG.exception(_LE("Database registration exception: %s"), e)
        return False
    return True
def drop_db():
    try:
        engine = get_engine()
        m.Cluster.metadata.drop_all(engine)
    except Exception as e:
        LOG.exception(_LE("Database shutdown exception: %s"), e)
        return False
    return True
def update_job_statuses():
    ctx = context.ctx()
    for je in conductor.job_execution_get_all(ctx, end_time=None):
        try:
            get_job_status(je.id)
        except Exception as e:
            LOG.error(
                _LE("Error during update job execution {job}: "
                    "{error}").format(job=je.id, error=e))
def validate_cluster_creating(cluster):
    if not cmu.have_cm_api_libs():
        LOG.error(_LE("For provisioning cluster with CDH plugin install "
                      "'cm_api' package version 6.0.2 or later."))
        raise ex.HadoopProvisionError(_("'cm_api' is not installed."))

    mng_count = _get_inst_count(cluster, 'MANAGER')
    if mng_count != 1:
        raise ex.InvalidComponentCountException('MANAGER', 1, mng_count)

    nn_count = _get_inst_count(cluster, 'NAMENODE')
    if nn_count != 1:
        raise ex.InvalidComponentCountException('NAMENODE', 1, nn_count)

    snn_count = _get_inst_count(cluster, 'SECONDARYNAMENODE')
    if snn_count != 1:
        raise ex.InvalidComponentCountException('SECONDARYNAMENODE', 1,
                                                snn_count)

    rm_count = _get_inst_count(cluster, 'RESOURCEMANAGER')
    if rm_count not in [0, 1]:
        raise ex.InvalidComponentCountException('RESOURCEMANAGER', '0 or 1',
                                                rm_count)

    hs_count = _get_inst_count(cluster, 'JOBHISTORY')
    if hs_count not in [0, 1]:
        raise ex.InvalidComponentCountException('JOBHISTORY', '0 or 1',
                                                hs_count)

    if rm_count > 0 and hs_count < 1:
        raise ex.RequiredServiceMissingException(
            'JOBHISTORY', required_by='RESOURCEMANAGER')

    nm_count = _get_inst_count(cluster, 'NODEMANAGER')
    if rm_count == 0:
        if nm_count > 0:
            raise ex.RequiredServiceMissingException(
                'RESOURCEMANAGER', required_by='NODEMANAGER')

    oo_count = _get_inst_count(cluster, 'OOZIE_SERVER')
    dn_count = _get_inst_count(cluster, 'DATANODE')
    if oo_count not in [0, 1]:
        raise ex.InvalidComponentCountException('OOZIE_SERVER', '0 or 1',
                                                oo_count)

    if oo_count == 1:
        if dn_count < 1:
            raise ex.RequiredServiceMissingException(
                'DATANODE', required_by='OOZIE_SERVER')

        if nm_count < 1:
            raise ex.RequiredServiceMissingException(
                'NODEMANAGER', required_by='OOZIE_SERVER')

        if hs_count != 1:
            raise ex.RequiredServiceMissingException(
                'JOBHISTORY', required_by='OOZIE_SERVER')
def wrapper(cluster_id, *args, **kwds):
    try:
        f(cluster_id, *args, **kwds)
    except Exception as ex:
        # something happened during cluster operation
        ctx = context.ctx()
        cluster = conductor.cluster_get(ctx, cluster_id)
        # check if cluster still exists (it might have been removed)
        if cluster is None or cluster.status == 'Deleting':
            LOG.info(_LI("Cluster %s was deleted or marked for "
                         "deletion. Canceling current operation."),
                     cluster_id)
            return

        LOG.exception(
            _LE("Error during operating cluster '%(name)s' (reason: "
                "%(reason)s)"), {'name': cluster.name, 'reason': ex})
        try:
            # trying to rollback
            if _rollback_cluster(cluster, ex):
                g.change_cluster_status(cluster, "Active")
            else:
                g.change_cluster_status(cluster, "Error")
        except Exception as rex:
            cluster = conductor.cluster_get(ctx, cluster_id)
            # check if cluster still exists (it might have been
            # removed during rollback)
            if cluster is None:
                LOG.info(_LI("Cluster with %s was deleted. Canceling "
                             "current operation."), cluster_id)
                return

            LOG.exception(
                _LE("Error during rollback of cluster '%(name)s' (reason: "
                    "%(reason)s)"), {'name': cluster.name, 'reason': rex})
            g.change_cluster_status(cluster, "Error")
def update_job_statuses():
    ctx = context.ctx()
    for je in conductor.job_execution_get_all(ctx, end_time=None):
        try:
            get_job_status(je.id)
        except Exception as e:
            LOG.exception(
                _LE("Error during update job execution %(job)s: %(error)s"),
                {'job': je.id, 'error': e})
def abort_and_log(status_code, descr, exc=None):
    LOG.error(_LE("Request aborted with status code %(code)s and "
                  "message '%(message)s'"),
              {'code': status_code, 'message': descr})

    if exc is not None:
        LOG.error(traceback.format_exc())

    flask.abort(status_code, description=descr)
def _instance_get_data(self, instance, lock):
    try:
        with instance.remote() as r:
            data = self._get_resolv_conf(r)
    except Exception:
        data = None
        LOG.exception(_LE("Couldn't read '/etc/resolv.conf'"))
    with lock:
        self._data[instance.get_ip_or_dns_name()] = data
def abort_and_log(status_code, descr, exc=None):
    LOG.error(_LE("Request aborted with status code {code} and "
                  "message '{message}'").format(code=status_code,
                                                message=descr))

    if exc is not None:
        LOG.error(traceback.format_exc())

    flask.abort(status_code, description=descr)
def check_health(self):
    inst_ips_or_names = self.provider.get_accessibility_data()
    if inst_ips_or_names:
        insts = ', '.join(inst_ips_or_names)
        LOG.exception(
            _LE("Instances (%s) are not available in the cluster") % insts)
        raise RedHealthError(
            _("Instances (%s) are not available in the cluster.") % insts)
    return _("All instances are available")
def hup(*args):
    """Shuts down the server(s).

    Shuts down the server(s), but allows running requests to complete
    """
    LOG.error(_LE('SIGHUP received'))
    signal.signal(signal.SIGHUP, signal.SIG_IGN)
    os.killpg(0, signal.SIGHUP)
    signal.signal(signal.SIGHUP, hup)
def bad_request(error):
    error_code = 400

    LOG.error(_LE("Validation Error occurred: "
                  "error_code={code}, error_message={message}, "
                  "error_name={name}").format(code=error_code,
                                              message=error.message,
                                              name=error.code))

    return render_error_message(error_code, error.message, error.code)
def access_denied(error):
    error_code = 403

    LOG.error(_LE("Access Denied: "
                  "error_code={code}, error_message={message}, "
                  "error_name={name}").format(code=error_code,
                                              message=error.message,
                                              name=error.code))

    return render_error_message(error_code, error.message, error.code)
def not_found(error):
    error_code = 404

    LOG.error(_LE("Not Found exception occurred: "
                  "error_code={code}, error_message={message}, "
                  "error_name={name}").format(code=error_code,
                                              message=error.message,
                                              name=error.code))

    return render_error_message(error_code, error.message, error.code)