Example #1
 def get_alerts_data(self, service=None):
     if self._data is not None:
         # return cached data
         return self._data.get(service, []) if service else self._data
     self._data = {}
     self._cluster_services = []
     try:
         ambari = plugin_utils.get_instance(self.cluster,
                                            p_common.AMBARI_SERVER)
         password = self.cluster.extra.get("ambari_password")
         with client.AmbariClient(ambari, password=password) as ambari:
             resp = ambari.get_alerts_data(self.cluster)
         for alert in resp:
             alert = alert.get('Alert', {})
             service = alert.get('service_name').lower()
             if service not in self._data:
                 self._data[service] = []
                 self._cluster_services.append(service)
             self._data[service].append(alert)
     except Exception as e:
         prefix = _("Can't get response from Ambari Monitor")
         msg = _("%(problem)s: %(description)s") % {
             'problem': prefix,
             'description': six.text_type(e)
         }
          # no need to include the exception text here; LOG.exception logs the traceback
         LOG.exception(prefix)
         self._exception_store = msg
Example #2
 def check_health(self):
     imp_map = {'OK': 'GREEN', 'WARNING': 'YELLOW', 'CRITICAL': 'RED'}
     other_map = {'OK': 'GREEN'}
     color_counter = collections.Counter()
     important_services = self.get_important_services()
     for alert in self.provider.get_alerts_data(self.service):
         alert_summary = alert.get('state', 'UNKNOWN')
         if self.service in important_services:
             target = imp_map.get(alert_summary, 'RED')
         else:
             target = other_map.get(alert_summary, 'YELLOW')
         color_counter[target] += 1
     if color_counter['RED'] > 0 and color_counter['YELLOW'] > 0:
         raise health_check_base.RedHealthError(
             _("Ambari Monitor has responded that cluster has "
               "%(red)d critical and %(yellow)d warning alert(s)") % {
                   'red': color_counter['RED'],
                   'yellow': color_counter['YELLOW']
               })
     elif color_counter['RED'] > 0:
         raise health_check_base.RedHealthError(
             _("Ambari Monitor has responded that cluster has "
               "%(red)d critical alert(s)") % {'red': color_counter['RED']})
     elif color_counter['YELLOW'] > 0:
         raise health_check_base.YellowHealthError(
             _("Ambari Monitor has responded that cluster "
               "has %d warning alert(s)") % color_counter['YELLOW'])
     return _("No alerts found")
Example #3
def _check_storm(cluster):
    dr_count = utils.get_instances_count(cluster, common.DRPC_SERVER)
    ni_count = utils.get_instances_count(cluster, common.NIMBUS)
    su_count = utils.get_instances_count(cluster, common.STORM_UI_SERVER)
    sv_count = utils.get_instances_count(cluster, common.SUPERVISOR)
    if dr_count > 1:
        raise ex.InvalidComponentCountException(common.DRPC_SERVER,
                                                _("0 or 1"), dr_count)
    if ni_count > 1:
        raise ex.InvalidComponentCountException(common.NIMBUS, _("0 or 1"),
                                                ni_count)
    if su_count > 1:
        raise ex.InvalidComponentCountException(common.STORM_UI_SERVER,
                                                _("0 or 1"), su_count)
    if dr_count == 0 and ni_count == 1:
        raise ex.RequiredServiceMissingException(common.DRPC_SERVER,
                                                 required_by=common.NIMBUS)
    if dr_count == 1 and ni_count == 0:
        raise ex.RequiredServiceMissingException(
            common.NIMBUS, required_by=common.DRPC_SERVER)
    if su_count == 1 and (dr_count == 0 or ni_count == 0):
        raise ex.RequiredServiceMissingException(
            common.NIMBUS, required_by=common.STORM_UI_SERVER)
    if dr_count == 1 and sv_count == 0:
        raise ex.RequiredServiceMissingException(
            common.SUPERVISOR, required_by=common.DRPC_SERVER)
    if sv_count > 0 and dr_count == 0:
        raise ex.RequiredServiceMissingException(common.DRPC_SERVER,
                                                 required_by=common.SUPERVISOR)
Example #4
 def check_health(self):
     important_services = self.provider.get_important_services()
     observed_data = self.provider.get_health_status(self.service)
     imp_map = {'BAD': 'red', 'CONCERNING': 'yellow', 'GOOD': 'green'}
     summary = observed_data['summary']
     checks = observed_data.get('checks', [])
     failed_checks = []
     for check in checks:
         if check['summary'] != 'GOOD':
             failed_checks.append('%(name)s - %(summary)s state' % {
                 'name': check['name'],
                 'summary': check['summary']
             })
     additional_info = None
     if failed_checks:
         additional_info = _("The following checks did not pass: %s"
                             ) % ",".join(failed_checks)
     if self.service in important_services:
         overall = imp_map.get(summary, 'red')
     else:
         overall = 'green'
         if summary != 'GOOD':
             overall = 'yellow'
     msg = _("Cloudera Manager has responded that service is in "
             "the %s state") % summary
     if additional_info:
         msg = _("%(problem)s. %(description)s") % {
             'problem': msg,
             'description': additional_info
         }
     if overall == 'red':
         raise health_check_base.RedHealthError(msg)
     elif overall == 'yellow':
         raise health_check_base.YellowHealthError(msg)
     return msg
Example #5
    def _create_config_obj(self, item, target='general', scope='cluster',
                           high_priority=False):
        def _prepare_value(value):
            if isinstance(value, str):
                return value.strip().lower()
            return value

        conf_name = _prepare_value(item.get('name', None))

        conf_value = _prepare_value(item.get('value', None))

        if not conf_name:
            raise ex.HadoopProvisionError(_("Config missing 'name'"))

        if conf_value is None:
            raise ex.PluginInvalidDataException(
                _("Config '%s' missing 'value'") % conf_name)

        if high_priority or item.get('priority', 2) == 1:
            priority = 1
        else:
            priority = 2

        return p.Config(
            name=conf_name,
            applicable_target=target,
            scope=scope,
            config_type=item.get('config_type', "string"),
            config_values=item.get('config_values', None),
            default_value=conf_value,
            is_optional=item.get('is_optional', True),
            description=item.get('description', None),
            priority=priority)
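
The helper above takes plain dicts; a hedged sketch of the input shape it expects and how name/value normalization and priority resolution play out (the sample items are invented):

# hypothetical config items as they would be passed to _create_config_obj()
items = [
    {'name': ' Dfs.Replication ', 'value': '3', 'priority': 1},
    {'name': 'io.file.buffer.size', 'value': 65536},   # no 'priority' -> defaults to 2
    {'name': 'yarn.log-aggregation-enable', 'value': ' TRUE '},
]

for item in items:
    # same normalization as _prepare_value(): strip and lowercase strings only
    name = item['name'].strip().lower()
    value = item['value']
    if isinstance(value, str):
        value = value.strip().lower()
    priority = 1 if item.get('priority', 2) == 1 else 2
    print(name, repr(value), priority)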
Example #6
    def _hue_validation(cls, cluster):
        hue_count = cls.get_inst_count(cluster, 'HUE_SERVER')
        if hue_count > 1:
            raise ex.InvalidComponentCountException('HUE_SERVER', _('0 or 1'),
                                                    hue_count)

        shs_count = cls.get_inst_count(cluster, 'SPARK_YARN_HISTORY_SERVER')
        hms_count = cls.get_inst_count(cluster, 'HIVE_METASTORE')
        oo_count = cls.get_inst_count(cluster, 'OOZIE_SERVER')
        rm_count = cls.get_inst_count(cluster, 'YARN_RESOURCEMANAGER')

        if shs_count > 1:
            raise ex.InvalidComponentCountException(
                'SPARK_YARN_HISTORY_SERVER', _('0 or 1'), shs_count)
        if shs_count and not rm_count:
            raise ex.RequiredServiceMissingException(
                'YARN_RESOURCEMANAGER',
                required_by='SPARK_YARN_HISTORY_SERVER')

        if oo_count < 1 and hue_count:
            raise ex.RequiredServiceMissingException('OOZIE_SERVER',
                                                     required_by='HUE_SERVER')

        if hms_count < 1 and hue_count:
            raise ex.RequiredServiceMissingException('HIVE_METASTORE',
                                                     required_by='HUE_SERVER')
Example #7
 def _await_cldb(self, cluster_context, instances=None, timeout=600):
     instances = instances or cluster_context.get_instances()
     cldb_node = cluster_context.get_instance(mfs.CLDB)
     start_time = timeutils.utcnow()
     retry_count = 0
     with cldb_node.remote() as r:
         LOG.debug("Waiting {count} seconds for CLDB initialization".format(
             count=timeout))
         while timeutils.delta_seconds(start_time,
                                       timeutils.utcnow()) < timeout:
             ec, out = r.execute_command(NODE_LIST_CMD,
                                         raise_when_error=False)
             resp = json.loads(out)
             status = resp['status']
             if str(status).lower() == 'ok':
                 ips = [n['ip'] for n in resp['data']]
                 retry_count += 1
                 for i in instances:
                     if (i.internal_ip not in ips
                             and (retry_count > DEFAULT_RETRY_COUNT)):
                         msg = _("Node failed to connect to CLDB: %s"
                                 ) % i.internal_ip
                         raise ex.HadoopProvisionError(msg)
                 break
             else:
                 context.sleep(DELAY)
         else:
             raise ex.HadoopProvisionError(_("CLDB failed to start"))
Example #8
    def _hdfs_ha_validation(cls, cluster):
        jn_count = cls.get_inst_count(cluster, 'HDFS_JOURNALNODE')
        zk_count = cls.get_inst_count(cluster, 'ZOOKEEPER_SERVER')

        require_anti_affinity = cls.PU.c_helper.get_required_anti_affinity(
            cluster)

        if jn_count > 0:
            if jn_count < 3:
                raise ex.InvalidComponentCountException(
                    'HDFS_JOURNALNODE', _('not less than 3'), jn_count)
            if not jn_count % 2:
                raise ex.InvalidComponentCountException(
                    'HDFS_JOURNALNODE', _('be odd'), jn_count)
            if zk_count < 1:
                raise ex.RequiredServiceMissingException('ZOOKEEPER',
                                                         required_by='HDFS HA')
            if require_anti_affinity:
                if 'HDFS_SECONDARYNAMENODE' not in \
                        cls._get_anti_affinity(cluster):
                    raise ex.NameNodeHAConfigurationError(
                        _('HDFS_SECONDARYNAMENODE should be enabled '
                          'in anti_affinity.'))
                if 'HDFS_NAMENODE' not in cls._get_anti_affinity(cluster):
                    raise ex.NameNodeHAConfigurationError(
                        _('HDFS_NAMENODE should be enabled in anti_affinity.'))
Example #9
    def _impala_validation(cls, cluster):
        ics_count = cls.get_inst_count(cluster, 'IMPALA_CATALOGSERVER')
        iss_count = cls.get_inst_count(cluster, 'IMPALA_STATESTORE')
        id_count = cls.get_inst_count(cluster, 'IMPALAD')
        dn_count = cls.get_inst_count(cluster, 'HDFS_DATANODE')
        hms_count = cls.get_inst_count(cluster, 'HIVE_METASTORE')

        if ics_count > 1:
            raise ex.InvalidComponentCountException('IMPALA_CATALOGSERVER',
                                                    _('0 or 1'), ics_count)
        if iss_count > 1:
            raise ex.InvalidComponentCountException('IMPALA_STATESTORE',
                                                    _('0 or 1'), iss_count)
        if ics_count == 1:
            datanode_ng = u.get_node_groups(cluster, "HDFS_DATANODE")
            impalad_ng = u.get_node_groups(cluster, "IMPALAD")
            datanodes = set(ng.id for ng in datanode_ng)
            impalads = set(ng.id for ng in impalad_ng)

            if datanodes != impalads:
                raise ex.InvalidClusterTopology(
                    _("IMPALAD must be installed on every HDFS_DATANODE"))

            if iss_count != 1:
                raise ex.RequiredServiceMissingException('IMPALA_STATESTORE',
                                                         required_by='IMPALA')
            if id_count < 1:
                raise ex.RequiredServiceMissingException('IMPALAD',
                                                         required_by='IMPALA')
            if dn_count < 1:
                raise ex.RequiredServiceMissingException('HDFS_DATANODE',
                                                         required_by='IMPALA')
            if hms_count < 1:
                raise ex.RequiredServiceMissingException('HIVE_METASTORE',
                                                         required_by='IMPALA')
Example #10
    def _yarn_ha_validation(cls, cluster):
        rm_count = cls.get_inst_count(cluster, 'YARN_RESOURCEMANAGER')
        zk_count = cls.get_inst_count(cluster, 'ZOOKEEPER_SERVER')
        stdb_rm_count = cls.get_inst_count(cluster, 'YARN_STANDBYRM')

        require_anti_affinity = cls.PU.c_helper.get_required_anti_affinity(
            cluster)

        if stdb_rm_count > 1:
            raise ex.InvalidComponentCountException('YARN_STANDBYRM',
                                                    _('0 or 1'), stdb_rm_count)
        if stdb_rm_count > 0:
            if rm_count < 1:
                raise ex.RequiredServiceMissingException(
                    'YARN_RESOURCEMANAGER', required_by='RM HA')
            if zk_count < 1:
                raise ex.RequiredServiceMissingException('ZOOKEEPER',
                                                         required_by='RM HA')
            if require_anti_affinity:
                if 'YARN_RESOURCEMANAGER' not in \
                        cls._get_anti_affinity(cluster):
                    raise ex.ResourceManagerHAConfigurationError(
                        _('YARN_RESOURCEMANAGER should be enabled in '
                          'anti_affinity.'))
                if 'YARN_STANDBYRM' not in cls._get_anti_affinity(cluster):
                    raise ex.ResourceManagerHAConfigurationError(
                        _('YARN_STANDBYRM should be'
                          ' enabled in anti_affinity.'))
Example #11
def _get_ha_params():
    enable_namenode_ha = provisioning.Config(
        name=common.NAMENODE_HA,
        applicable_target="general",
        scope="cluster",
        config_type="bool",
        default_value=False,
        is_optional=True,
        description=_("Enable NameNode HA"),
        priority=1)

    enable_resourcemanager_ha = provisioning.Config(
        name=common.RESOURCEMANAGER_HA,
        applicable_target="general",
        scope="cluster",
        config_type="bool",
        default_value=False,
        is_optional=True,
        description=_("Enable ResourceManager HA"),
        priority=1)

    enable_regionserver_ha = provisioning.Config(
        name=common.HBASE_REGIONSERVER_HA,
        applicable_target="general",
        scope="cluster",
        config_type="bool",
        default_value=False,
        is_optional=True,
        description=_("Enable HBase RegionServer HA"),
        priority=1)

    return [
        enable_namenode_ha, enable_resourcemanager_ha, enable_regionserver_ha
    ]
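
As a hedged illustration of how such flags are read back later (example #29 checks RESOURCEMANAGER_HA the same way), a standalone sketch in which the config names and the cluster_configs layout are placeholders for common.NAMENODE_HA and friends:

# hypothetical cluster_configs as stored for the "general" applicable target
cluster_configs = {
    'general': {
        'NameNode HA': True,
        'ResourceManager HA': False,
    }
}

def is_ha_enabled(configs, name):
    # missing keys fall back to the Config default_value (False)
    return configs.get('general', {}).get(name, False)

print(is_ha_enabled(cluster_configs, 'NameNode HA'))            # True
print(is_ha_enabled(cluster_configs, 'HBase RegionServer HA'))  # False (default)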
Example #12
    def _validate_existing_ng_scaling(self, cluster, existing):
        scalable_processes = self._get_scalable_processes()
        dn_to_delete = 0
        for ng in cluster.node_groups:
            if ng.id in existing:
                if ng.count > existing[ng.id] and ("datanode" in
                                                   ng.node_processes):
                    dn_to_delete += ng.count - existing[ng.id]
                if not set(ng.node_processes).issubset(scalable_processes):
                    raise ex.NodeGroupCannotBeScaled(
                        ng.name, _("Spark plugin cannot scale nodegroup"
                                   " with processes: %s") %
                        ' '.join(ng.node_processes))

        dn_amount = len(utils.get_instances(cluster, "datanode"))
        rep_factor = utils.get_config_value_or_default('HDFS',
                                                       "dfs.replication",
                                                       cluster)

        if dn_to_delete > 0 and dn_amount - dn_to_delete < rep_factor:
            raise ex.ClusterCannotBeScaled(
                cluster.name, _("Spark plugin cannot shrink cluster because "
                                "there would be not enough nodes for HDFS "
                                "replicas (replication factor is %s)") %
                rep_factor)
Example #13
 def wait_ambari_requests(self, requests, cluster_name):
     requests = set(requests)
     failed = []
     context.sleep(20)
     while len(requests) > 0:
         completed, not_completed = set(), set()
         for req_id in requests:
             request = self.get_request_info(cluster_name, req_id)
             status = request.get("request_status")
             if status == 'COMPLETED':
                 completed.add(req_id)
             elif status in ['IN_PROGRESS', 'PENDING']:
                 not_completed.add(req_id)
             else:
                 failed.append(request)
         if failed:
             msg = _("Some Ambari request(s) "
                     "not in COMPLETED state: %(description)s.")
             descrs = []
             for req in failed:
                 descr = _(
                     "request %(id)d: %(name)s - in status %(status)s")
                 descrs.append(descr %
                               {'id': req.get("id"),
                                'name': req.get("request_context"),
                                'status': req.get("request_status")})
              raise p_exc.HadoopProvisionError(
                  msg % {'description': ', '.join(descrs)})
         requests = not_completed
         context.sleep(5)
         LOG.debug("Waiting for %d ambari request(s) to be completed",
                   len(not_completed))
     LOG.debug("All ambari requests have been completed")
Example #14
 def get_service(self, node_process):
     ui_name = self.get_service_name_by_node_process(node_process)
     if ui_name is None:
         raise e.PluginInvalidDataException(
             _('Service not found in services list'))
     version = self.get_chosen_service_version(ui_name)
     service = self._find_service_instance(ui_name, version)
     if service is None:
         raise e.PluginInvalidDataException(_('Can not map service'))
     return service
Example #15
def _check_jn_ha(cluster):
    jn_count = utils.get_instances_count(cluster, common.JOURNAL_NODE)
    if jn_count < 3:
        raise ex.InvalidComponentCountException(
            common.JOURNAL_NODE, _("3 or more. Odd number"), jn_count,
            _("At least 3 JournalNodes are required for HA"))
    if jn_count % 2 != 1:
        raise ex.InvalidComponentCountException(
            common.JOURNAL_NODE, _("Odd number"), jn_count,
            _("Odd number of JournalNodes are required for HA"))
Example #16
def _check_zk_ha(cluster):
    zk_count = utils.get_instances_count(cluster, common.ZOOKEEPER_SERVER)
    if zk_count < 3:
        raise ex.InvalidComponentCountException(
            common.ZOOKEEPER_SERVER, _("3 or more. Odd number"), zk_count,
            _("At least 3 ZooKeepers are required for HA"))
    if zk_count % 2 != 1:
        raise ex.InvalidComponentCountException(
            common.ZOOKEEPER_SERVER, _("Odd number"), zk_count,
            _("Odd number of ZooKeepers are required for HA"))
Example #17
 def stop(self, cluster_context, instances=None):
     instances = instances or cluster_context.get_instances()
     zookeepers = cluster_context.filter_instances(instances, mng.ZOOKEEPER)
     utils.add_provisioning_step(cluster_context.cluster.id,
                                 _("Stop ZooKeepers nodes"),
                                 len(zookeepers))
     self._stop_zk_nodes(zookeepers)
     utils.add_provisioning_step(cluster_context.cluster.id,
                                 _("Stop Warden nodes"), len(instances))
     self._stop_warden_on_nodes(instances)
Example #18
    def _basic_validation(cls, cluster):

        mng_count = cls.get_inst_count(cluster, 'CLOUDERA_MANAGER')
        if mng_count != 1:
            raise ex.InvalidComponentCountException('CLOUDERA_MANAGER', 1,
                                                    mng_count)

        nn_count = cls.get_inst_count(cluster, 'HDFS_NAMENODE')
        if nn_count != 1:
            raise ex.InvalidComponentCountException('HDFS_NAMENODE', 1,
                                                    nn_count)

        snn_count = cls.get_inst_count(cluster, 'HDFS_SECONDARYNAMENODE')
        if snn_count != 1:
            raise ex.InvalidComponentCountException('HDFS_SECONDARYNAMENODE',
                                                    1, snn_count)
        dn_count = cls.get_inst_count(cluster, 'HDFS_DATANODE')
        replicas = cls.PU.get_config_value('HDFS', 'dfs_replication', cluster)
        if dn_count < replicas:
            raise ex.InvalidComponentCountException(
                'HDFS_DATANODE', replicas, dn_count,
                _('Number of datanodes must not be'
                  ' less than dfs_replication.'))

        du_reserved = cls.PU.get_config_value('DATANODE',
                                              'dfs_datanode_du_reserved',
                                              cluster)
        du_reserved = du_reserved / 1073741824.  # convert bytes to GiB
        for node_group in cluster.node_groups:
            volume_size = node_group.volumes_size
            if volume_size and volume_size < du_reserved:
                raise ex.InvalidVolumeSizeException(volume_size, du_reserved)

        rm_count = cls.get_inst_count(cluster, 'YARN_RESOURCEMANAGER')
        if rm_count > 1:
            raise ex.InvalidComponentCountException('YARN_RESOURCEMANAGER',
                                                    _('0 or 1'), rm_count)

        hs_count = cls.get_inst_count(cluster, 'YARN_JOBHISTORY')
        if hs_count > 1:
            raise ex.InvalidComponentCountException('YARN_JOBHISTORY',
                                                    _('0 or 1'), hs_count)

        if rm_count > 0 and hs_count < 1:
            raise ex.RequiredServiceMissingException(
                'YARN_JOBHISTORY', required_by='YARN_RESOURCEMANAGER')

        nm_count = cls.get_inst_count(cluster, 'YARN_NODEMANAGER')
        if rm_count == 0:
            if nm_count > 0:
                raise ex.RequiredServiceMissingException(
                    'YARN_RESOURCEMANAGER', required_by='YARN_NODEMANAGER')
Example #19
class NodeRequiredServiceMissingException(e.RequiredServiceMissingException):
    MISSING_MSG = _('Node "%(ng_name)s" is missing component %(component)s')
    REQUIRED_MSG = _('%(message)s, required by %(required_by)s')

    def __init__(self, service_name, ng_name, required_by=None):
        super(NodeRequiredServiceMissingException,
              self).__init__(service_name, required_by)
        args = {'ng_name': ng_name, 'component': service_name}
        self.message = (NodeRequiredServiceMissingException.MISSING_MSG % args)
        if required_by:
            args = {'message': self.message, 'required_by': required_by}
            self.message = (NodeRequiredServiceMissingException.REQUIRED_MSG %
                            args)
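
A hedged sketch of how the resulting message reads; the stand-in base class and the `_` shim replace the real Sahara classes so the snippet runs on its own:

_ = lambda s: s  # stand-in for the i18n marker

class RequiredServiceMissingException(Exception):  # simplified stand-in base
    def __init__(self, service_name, required_by=None):
        super().__init__(service_name)

class NodeRequiredServiceMissingException(RequiredServiceMissingException):
    MISSING_MSG = _('Node "%(ng_name)s" is missing component %(component)s')
    REQUIRED_MSG = _('%(message)s, required by %(required_by)s')

    def __init__(self, service_name, ng_name, required_by=None):
        super().__init__(service_name, required_by)
        args = {'ng_name': ng_name, 'component': service_name}
        self.message = self.MISSING_MSG % args
        if required_by:
            self.message = self.REQUIRED_MSG % {
                'message': self.message, 'required_by': required_by}

err = NodeRequiredServiceMissingException('Oozie', 'worker-ng', required_by='Hue')
print(err.message)
# Node "worker-ng" is missing component Oozie, required by Hue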
Example #20
    def get_service_by_role(self, role, cluster=None, instance=None):
        if cluster:
            cm_cluster = self.get_cloudera_cluster(cluster)
        elif instance:
            cm_cluster = self.get_cloudera_cluster(instance.cluster)
        else:
            raise ValueError(_("'cluster' or 'instance' argument missed"))

        if role in ['NAMENODE', 'DATANODE', 'SECONDARYNAMENODE',
                    'HDFS_GATEWAY']:
            return cm_cluster.get_service(self.HDFS_SERVICE_NAME)
        elif role in ['RESOURCEMANAGER', 'NODEMANAGER', 'JOBHISTORY',
                      'YARN_GATEWAY']:
            return cm_cluster.get_service(self.YARN_SERVICE_NAME)
        elif role in ['OOZIE_SERVER']:
            return cm_cluster.get_service(self.OOZIE_SERVICE_NAME)
        elif role in ['HIVESERVER2', 'HIVEMETASTORE', 'WEBHCAT']:
            return cm_cluster.get_service(self.HIVE_SERVICE_NAME)
        elif role in ['HUE_SERVER']:
            return cm_cluster.get_service(self.HUE_SERVICE_NAME)
        elif role in ['SPARK_YARN_HISTORY_SERVER']:
            return cm_cluster.get_service(self.SPARK_SERVICE_NAME)
        elif role in ['SERVER']:
            return cm_cluster.get_service(self.ZOOKEEPER_SERVICE_NAME)
        elif role in ['MASTER', 'REGIONSERVER']:
            return cm_cluster.get_service(self.HBASE_SERVICE_NAME)
        elif role in ['AGENT']:
            return cm_cluster.get_service(self.FLUME_SERVICE_NAME)
        elif role in ['SENTRY_SERVER']:
            return cm_cluster.get_service(self.SENTRY_SERVICE_NAME)
        elif role in ['SQOOP_SERVER']:
            return cm_cluster.get_service(self.SQOOP_SERVICE_NAME)
        elif role in ['SOLR_SERVER']:
            return cm_cluster.get_service(self.SOLR_SERVICE_NAME)
        elif role in ['HBASE_INDEXER']:
            return cm_cluster.get_service(self.KS_INDEXER_SERVICE_NAME)
        elif role in ['CATALOGSERVER', 'STATESTORE', 'IMPALAD', 'LLAMA']:
            return cm_cluster.get_service(self.IMPALA_SERVICE_NAME)
        elif role in ['KMS']:
            return cm_cluster.get_service(self.KMS_SERVICE_NAME)
        elif role in ['JOURNALNODE']:
            return cm_cluster.get_service(self.HDFS_SERVICE_NAME)
        elif role in ['YARN_STANDBYRM']:
            return cm_cluster.get_service(self.YARN_SERVICE_NAME)
        elif role in ['KAFKA_BROKER']:
            return cm_cluster.get_service(self.KAFKA_SERVICE_NAME)
        else:
            raise ValueError(
                _("Process %(process)s is not supported by CDH plugin") %
                {'process': role})
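
The long elif chain is equivalent to a role-to-attribute lookup table; a hedged refactoring sketch (not the plugin's actual code) that keeps the same role names and service-name attributes:

# hypothetical mapping from CDH role name to the service-name attribute
# that get_service_by_role() resolves on the provider object
ROLE_TO_SERVICE_ATTR = {
    'NAMENODE': 'HDFS_SERVICE_NAME', 'DATANODE': 'HDFS_SERVICE_NAME',
    'SECONDARYNAMENODE': 'HDFS_SERVICE_NAME', 'HDFS_GATEWAY': 'HDFS_SERVICE_NAME',
    'JOURNALNODE': 'HDFS_SERVICE_NAME',
    'RESOURCEMANAGER': 'YARN_SERVICE_NAME', 'NODEMANAGER': 'YARN_SERVICE_NAME',
    'JOBHISTORY': 'YARN_SERVICE_NAME', 'YARN_GATEWAY': 'YARN_SERVICE_NAME',
    'YARN_STANDBYRM': 'YARN_SERVICE_NAME',
    'OOZIE_SERVER': 'OOZIE_SERVICE_NAME',
    'HIVESERVER2': 'HIVE_SERVICE_NAME', 'HIVEMETASTORE': 'HIVE_SERVICE_NAME',
    'WEBHCAT': 'HIVE_SERVICE_NAME',
    'HUE_SERVER': 'HUE_SERVICE_NAME',
    'SPARK_YARN_HISTORY_SERVER': 'SPARK_SERVICE_NAME',
    'SERVER': 'ZOOKEEPER_SERVICE_NAME',
    'MASTER': 'HBASE_SERVICE_NAME', 'REGIONSERVER': 'HBASE_SERVICE_NAME',
    'AGENT': 'FLUME_SERVICE_NAME',
    'SENTRY_SERVER': 'SENTRY_SERVICE_NAME',
    'SQOOP_SERVER': 'SQOOP_SERVICE_NAME',
    'SOLR_SERVER': 'SOLR_SERVICE_NAME',
    'HBASE_INDEXER': 'KS_INDEXER_SERVICE_NAME',
    'CATALOGSERVER': 'IMPALA_SERVICE_NAME', 'STATESTORE': 'IMPALA_SERVICE_NAME',
    'IMPALAD': 'IMPALA_SERVICE_NAME', 'LLAMA': 'IMPALA_SERVICE_NAME',
    'KMS': 'KMS_SERVICE_NAME',
    'KAFKA_BROKER': 'KAFKA_SERVICE_NAME',
}

def get_service_by_role(provider, role, cm_cluster):
    attr = ROLE_TO_SERVICE_ATTR.get(role)
    if attr is None:
        raise ValueError("Process %s is not supported by CDH plugin" % role)
    return cm_cluster.get_service(getattr(provider, attr))

The table makes the supported roles visible at a glance and keeps the unsupported-role error in one place.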
Example #21
def _check_hive(cluster):
    hs_count = utils.get_instances_count(cluster, common.HIVE_SERVER)
    hm_count = utils.get_instances_count(cluster, common.HIVE_METASTORE)
    if hs_count > 1:
        raise ex.InvalidComponentCountException(common.HIVE_SERVER,
                                                _("0 or 1"), hs_count)
    if hm_count > 1:
        raise ex.InvalidComponentCountException(common.HIVE_METASTORE,
                                                _("0 or 1"), hm_count)
    if hs_count == 0 and hm_count == 1:
        raise ex.RequiredServiceMissingException(
            common.HIVE_SERVER, required_by=common.HIVE_METASTORE)
    if hs_count == 1 and hm_count == 0:
        raise ex.RequiredServiceMissingException(
            common.HIVE_METASTORE, required_by=common.HIVE_SERVER)
Example #22
def _check_ranger(cluster):
    ra_count = utils.get_instances_count(cluster, common.RANGER_ADMIN)
    ru_count = utils.get_instances_count(cluster, common.RANGER_USERSYNC)
    if ra_count > 1:
        raise ex.InvalidComponentCountException(common.RANGER_ADMIN,
                                                _("0 or 1"), ra_count)
    if ru_count > 1:
        raise ex.InvalidComponentCountException(common.RANGER_USERSYNC,
                                                _("0 or 1"), ru_count)
    if ra_count == 1 and ru_count == 0:
        raise ex.RequiredServiceMissingException(
            common.RANGER_USERSYNC, required_by=common.RANGER_ADMIN)
    if ra_count == 0 and ru_count == 1:
        raise ex.RequiredServiceMissingException(
            common.RANGER_ADMIN, required_by=common.RANGER_USERSYNC)
Example #23
def validate_additional_ng_scaling(cluster, additional):
    rm = vu.get_resourcemanager(cluster)
    scalable_processes = _get_scalable_processes()

    for ng_id in additional:
        ng = u.get_by_id(cluster.node_groups, ng_id)
        if not set(ng.node_processes).issubset(scalable_processes):
            msg = _("Vanilla plugin cannot scale nodegroup with processes: %s")
            raise ex.NodeGroupCannotBeScaled(ng.name,
                                             msg % ' '.join(ng.node_processes))

        if not rm and 'nodemanager' in ng.node_processes:
            msg = _("Vanilla plugin cannot scale node group with processes "
                    "which have no master-processes run in cluster")
            raise ex.NodeGroupCannotBeScaled(ng.name, msg)
Example #24
def _check_decommission(cluster, instances, check_func, option):
    utils.plugin_option_poll(cluster, is_decommissioned, option,
                             _("Wait for decommissioning"), 5, {
                                 'cluster': cluster,
                                 'check_func': check_func,
                                 'instances': instances
                             })
Example #25
    def validate_job_execution(self, cluster, job, data):
        if not self.edp_supported(cluster.hadoop_version):
            raise ex.InvalidDataException(
                _('Storm {base} required to run {type} jobs').format(
                    base=EdpPyleusEngine.edp_base_version, type=job.type))

        super(EdpPyleusEngine, self).validate_job_execution(cluster, job, data)
Example #26
    def invoke(self, method, relpath=None, params=None, data=None,
               headers=None):
        """Invoke an API method

        :return: Raw body or JSON dictionary (if response content type is
                 JSON).
        """
        path = self._join_uri(relpath)
        resp = self._client.execute(method,
                                    path,
                                    params=params,
                                    data=data,
                                    headers=headers)
        try:
            body = resp.read()
        except Exception as e:
            # use a distinct name so the caught exception does not shadow
            # the exceptions module imported as 'ex'
            raise ex.CMApiException(
                _("Command %(method)s %(path)s failed: %(msg)s")
                % {'method': method, 'path': path, 'msg': six.text_type(e)})

        LOG.debug("{method} got response: {body}".format(method=method,
                                                         body=body[:32]))
        # Is the response application/json?
        if (len(body) != 0 and resp.info().getmaintype() == "application"
                and resp.info().getsubtype() == "json"):
            try:
                json_dict = json.loads(body)
                return json_dict
            except Exception:
                LOG.error('JSON decode error: {body}'.format(body=body))
                raise
        else:
            return body
Example #27
    def get(self, relpath=None, params=None):
        """Invoke the GET method on a resource

        :param relpath: Optional. A relative path to this resource's path.
        :param params: Key-value data.

        :return: A dictionary of the JSON result.
        """
        for retry in six.moves.xrange(self.retries + 1):
            if retry:
                context.sleep(self.retry_sleep)
            try:
                return self.invoke("GET", relpath, params)
            except (socket.error, urllib.error.URLError) as e:
                if "timed out" in six.text_type(e).lower():
                    if retry < self.retries:
                        LOG.warning("Timeout issuing GET request for "
                                    "{path}. Will retry".format(
                                        path=self._join_uri(relpath)))
                    else:
                        LOG.warning("Timeout issuing GET request for "
                                    "{path}. No retries left".format(
                                        path=self._join_uri(relpath)))
                else:
                    raise
        else:
            raise ex.CMApiException(_("Get retry max time reached."))
Example #28
    def validate_additional_ng_scaling(cls, cluster, additional):
        rm = cls.PU.get_resourcemanager(cluster)
        scalable_processes = cls._get_scalable_processes()

        for ng_id in additional:
            ng = u.get_by_id(cluster.node_groups, ng_id)
            if not set(ng.node_processes).issubset(scalable_processes):
                msg = _("CDH plugin cannot scale nodegroup with processes: "
                        "%(processes)s")
                raise ex.NodeGroupCannotBeScaled(
                    ng.name, msg % {'processes': ' '.join(ng.node_processes)})

            if not rm and 'YARN_NODEMANAGER' in ng.node_processes:
                msg = _("CDH plugin cannot scale node group with processes "
                        "which have no master-processes run in cluster")
                raise ex.NodeGroupCannotBeScaled(ng.name, msg)
Example #29
def _check_yarn(cluster):
    rm_count = utils.get_instances_count(cluster, common.RESOURCEMANAGER)
    nm_count = utils.get_instances_count(cluster, common.NODEMANAGER)
    hs_count = utils.get_instances_count(cluster, common.HISTORYSERVER)
    at_count = utils.get_instances_count(cluster, common.APP_TIMELINE_SERVER)

    if cluster.cluster_configs.get("general",
                                   {}).get(common.RESOURCEMANAGER_HA):
        _check_zk_ha(cluster)

        if rm_count != 2:
            raise ex.InvalidComponentCountException(common.RESOURCEMANAGER, 2,
                                                    rm_count)
    else:
        if rm_count != 1:
            raise ex.InvalidComponentCountException(common.RESOURCEMANAGER, 1,
                                                    rm_count)

    if hs_count != 1:
        raise ex.InvalidComponentCountException(common.HISTORYSERVER, 1,
                                                hs_count)
    if at_count != 1:
        raise ex.InvalidComponentCountException(common.APP_TIMELINE_SERVER, 1,
                                                at_count)
    if nm_count == 0:
        raise ex.InvalidComponentCountException(common.NODEMANAGER,
                                                _("1 or more"), nm_count)
Example #30
    def check_health(self):
        instances = self.cluster_context.get_instances(
            node_process=management.ZOOKEEPER)
        active_count = 0
        for instance in instances:
            if self._is_zookeeper_running(instance):
                active_count += 1

        if active_count == 0:
            raise health_check_base.RedHealthError(_(
                "ZooKeeper is not in running state"))

        if active_count < len(instances):
            raise health_check_base.YellowHealthError(_(
                "Some ZooKeeper processes are not in running state"))
        return _("ZooKeeper is in running state")