Example No. 1
    def _execute_atom(self, atom_fqdn):
        try:
            ns, atom_name = atom_fqdn.split(".atoms.")
            ns, obj_name = ns.split(".objects.")
            ns_str = ns.split(".")[-1]

            if "integrations" in ns:
                current_ns = getattr(NS.integrations, ns_str)
            else:
                current_ns = getattr(NS, ns_str)

            runnable_atom = current_ns.ns.get_atom(obj_name, atom_name)
            try:
                ret_val = runnable_atom(parameters=self.parameters).run()
                return ret_val
            except AtomExecutionFailedError:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                raise FlowExecutionFailedError(
                    str(
                        traceback.format_exception(exc_type, exc_value,
                                                   exc_traceback)))

        except (KeyError, AttributeError) as ex:
            _msg = "Could not find atom {0}".format(atom_fqdn)
            logger.log("error",
                       NS.publisher_id, {"message": _msg},
                       job_id=self.job_id,
                       flow_id=self.parameters['flow_id'])

            Event(
                ExceptionMessage(priority="error",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message": _msg,
                                     "exception": ex
                                 }))

        return False
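
To make the string handling at the top of `_execute_atom` concrete, here is a minimal standalone sketch of how an atom FQDN is split into namespace, object and atom names. The FQDN value below is made up for illustration; real values come from the flow definitions.

# Hypothetical atom FQDN, purely for illustration
atom_fqdn = "namespace.tendrl.objects.Volume.atoms.delete"

ns, atom_name = atom_fqdn.split(".atoms.")   # "namespace.tendrl.objects.Volume", "delete"
ns, obj_name = ns.split(".objects.")         # "namespace.tendrl", "Volume"
ns_str = ns.split(".")[-1]                   # "tendrl"

print("%s %s %s" % (ns_str, obj_name, atom_name))
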
Example No. 2
def get_latest_stats(node, resource):
    try:
        node_name = central_store_util.get_node_name_from_id(node)
        stats = NS.time_series_db_manager.get_plugin().get_metric_stats(
            node_name, resource, 'latest')
        if stats == "[]" or not stats:
            raise TendrlPerformanceMonitoringException(
                'Stats not yet available in time series db')
        return re.findall('Current:(.+?)Max', stats)
    except (ValueError, urllib3.exceptions.HTTPError,
            TendrlPerformanceMonitoringException) as ex:
        Event(
            ExceptionMessage(priority="debug",
                             publisher=NS.publisher_id,
                             payload={
                                 "message":
                                 'Failed to get latest stats of %s of '
                                 'node %s for node summary.' %
                                 (resource, node),
                                 "exception":
                                 ex
                             }))
        raise ex
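
For context, a small sketch of what the `re.findall('Current:(.+?)Max', stats)` call extracts; the stats string below is assumed, since the exact output format of the time series db plugin is not shown here.

import re

# Assumed plugin output in a "Current:<value> Max:<value>" style
stats = "node1.example.com/cpu/percent-user Current:12.5 Max:98.0 Min:0.1"

print(re.findall('Current:(.+?)Max', stats))  # ['12.5 '] - callers still strip/convert the value
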
Example No. 3
 def __init__(self):
     super(NotificationPluginManager, self).__init__()
     self.daemon = True
     try:
         self.load_plugins()
         notification_medium = []
         self.complete = threading.Event()
         for plugin in NotificationPlugin.plugins:
             notification_medium.append(plugin.name)
         NotificationMedia(media=notification_medium).save()
     except (AttributeError, SyntaxError, ValueError, KeyError, ImportError,
             etcd.EtcdException) as ex:
         Event(
             ExceptionMessage(priority="debug",
                              publisher="notifier",
                              payload={
                                  "message":
                                   'Failed to initialize notification '
                                  'manager',
                                  "exception":
                                  ex
                              }))
         raise ex
Example No. 4
 def load_plugins(self):
     try:
         path = os.path.dirname(os.path.abspath(__file__)) + '/dbplugins'
         pkg = 'tendrl.performance_monitoring.time_series_db.dbplugins'
         for py in [f[:-3] for f in os.listdir(path)
                    if f.endswith('.py') and f != '__init__.py']:
             plugin_name = '.'.join([pkg, py])
             mod = importlib.import_module(plugin_name)
             clsmembers = inspect.getmembers(mod, inspect.isclass)
             for name, cls in clsmembers:
                 exec("from %s import %s" % (plugin_name, name))
     except (SyntaxError, ValueError, ImportError) as ex:
         Event(
             ExceptionMessage(
                 priority="debug",
                 publisher=NS.publisher_id,
                 payload={"message": 'Failed to load the time series db '
                                     'plugins.',
                          "exception": ex
                          }
             )
         )
         raise ex
Example No. 5
 def cluster_nodes_summary(self, cluster_id):
     node_summaries = []
     node_ids = central_store_util.get_cluster_node_ids(cluster_id)
     for node_id in node_ids:
         try:
             node_summary = central_store_util.read(
                 '/monitoring/summary/nodes/%s' % node_id
             )
             node_summaries.append(node_summary)
         except EtcdKeyNotFound as ex:
             Event(
                 ExceptionMessage(
                     priority="debug",
                     publisher=NS.publisher_id,
                     payload={
                         "message": 'Error caught fetching node summary of'
                         ' node %s.' % node_id,
                         "exception": ex
                     }
                 )
             )
             continue
     return node_summaries
Example No. 6
 def __init__(self):
     super(NotificationPluginManager, self).__init__()
     try:
         self.load_plugins()
         notification_medium = []
         for plugin in NotificationPlugin.plugins:
             notification_medium.append(plugin.name)
         NS.notification_medium = notification_medium
         NotificationMedia(media=notification_medium).save()
         self.save_alertnotificationconfig()
     except (SyntaxError, ValueError, KeyError, etcd.EtcdKeyNotFound,
             etcd.EtcdConnectionFailed, etcd.EtcdException,
             NotificationPluginError) as ex:
         Event(
             ExceptionMessage(priority="error",
                              publisher="alerting",
                              payload={
                                  "message":
                                   'Failed to initialize notification '
                                  'manager',
                                  "exception":
                                  ex
                              }))
         raise AlertingError(str(ex))
Example No. 7
 def _run(self):
     while not self._complete.is_set():
         cluster_summaries = []
         clusters = central_store_util.get_cluster_ids()
         for clusterid in clusters:
             gevent.sleep(0.1)
             try:
                 cluster_summary = self.parse_cluster(clusterid)
                 cluster_summaries.append(cluster_summary.copy())
                 cluster_summary.save(update=False)
             except EtcdKeyNotFound:
                 pass
             except (EtcdException, AttributeError) as ex:
                 Event(
                     ExceptionMessage(priority="debug",
                                      publisher=NS.publisher_id,
                                      payload={
                                          "message":
                                          'Error caught computing summary.',
                                          "exception": ex
                                      }))
                 continue
         NS.sds_monitoring_manager.compute_system_summary(cluster_summaries)
         gevent.sleep(60)
Example No. 8
 def init_monitoring(self):
     try:
         node_dets = central_store_util.get_nodes_details()
         for node_det in node_dets:
             if (
                 node_det['node_id'] not in
                 self.monitoring_config_init_nodes
             ):
                 self.init_monitoring_on_node(node_det)
                 self.monitoring_config_init_nodes.append(
                     node_det['node_id']
                 )
     except TendrlPerformanceMonitoringException as ex:
         Event(
             ExceptionMessage(
                 priority="debug",
                 publisher=NS.publisher_id,
                 payload={"message": 'Failed to intialize monitoring '
                                     'configuration on nodes. ',
                          "exception": ex
                          }
             )
         )
         raise ex
Example No. 9
    def _application(self, env, start_response):
        try:
            if env['PATH_INFO'] != '/grafana_callback':
                start_response('404 Not Found',
                               [('Content-Type', 'text/html')])
                response = [b'<h1>Alert Not Found</h1>']
            else:
                data = env['wsgi.input'].read()
                data = json.loads(data)
                self.alert_handler.handle_alert(data["ruleId"])
                start_response('200 OK', [('Content-Type', 'text/html')])
                response = [b'<h1>Alert Received</h1>']
        except (IOError, AssertionError) as ex:
            Event(
                ExceptionMessage(priority="error",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message":
                                     "Unable to read alert from socket",
                                     "exception": ex
                                 }))
            response = [b'<h1>Error in reading alert from socket</h1>']

        return response
Example No. 10
    def on_sync_object(self, data):

        assert data['fsid'] == self.fsid

        sync_object = copy.deepcopy(data['data'])

        sync_type = SYNC_OBJECT_STR_TYPE[data['type']]
        new_object = self.inject_sync_object(data['type'], data['version'],
                                             sync_object)
        self._request_coll.on_map(sync_type, new_object)
        if new_object:
            # Check and raise any alerts if required

            # TODO(team) Enable the below if conditions when alerting is
            # needed for cluster health, mon status, pool status etc

            # if sync_type.str == "health":
            #    self._on_health(sync_object)
            # if sync_type.str == "mon_status":
            #    self._on_mon_status(sync_object)
            if sync_type.str == "osd_map":
                # self._on_pool_status(sync_object)
                self._on_osd_map(sync_object)

            NS.ceph.objects.SyncObject(
                updated=now(),
                sync_type=sync_type.str,
                version=new_object.version if isinstance(
                    new_object.version, int) else None,
                when=now(),
                data=data['data']).save(update=False)

            if sync_type.str == "health":
                NS.ceph.objects.GlobalDetails(
                    status=sync_object['overall_status']).save()
            if sync_type.str == "osd_map":
                # Pool out of band deletion handling
                try:
                    pools = NS._int.client.read(
                        "clusters/%s/Pools" % NS.tendrl_context.integration_id)
                    old_pool_ids = []
                    for pool in pools.leaves:
                        old_pool_ids.append(int(pool.key.split("/")[-1]))
                    new_pool_ids = []
                    for raw_pool in sync_object.get('pools', []):
                        new_pool_ids.append(raw_pool['pool'])
                    delete_pool_ids = set(old_pool_ids) - set(new_pool_ids)
                    for id in delete_pool_ids:
                        NS._int.client.delete(
                            "clusters/%s/Pools/%s" %
                            (NS.tendrl_context.integration_id, id),
                            recursive=True)
                except etcd.EtcdKeyNotFound as ex:
                    Event(
                        ExceptionMessage(priority="debug",
                                         publisher=NS.publisher_id,
                                         payload={
                                             "message":
                                             "No pools found \
                                     for ceph cluster %s" %
                                             NS.tendrl_context.integration_id,
                                             "exception":
                                             ex
                                         }))
                for raw_pool in sync_object.get('pools', []):
                    Event(
                        Message(priority="info",
                                publisher=NS.publisher_id,
                                payload={
                                    "message":
                                    "Updating Pool %s" % raw_pool['pool_name']
                                }))
                    pool_type = 'replicated'
                    if 'erasure_code_profile' in raw_pool and \
                        raw_pool['erasure_code_profile'] != "":
                        pool_type = 'erasure_coded'
                    quota_enabled = False
                    if ('quota_max_objects' in raw_pool and
                        raw_pool['quota_max_objects'] > 0) or \
                        ('quota_max_bytes' in raw_pool and
                         raw_pool['quota_max_bytes'] > 0):
                        quota_enabled = True
                    NS.ceph.objects.Pool(
                        pool_id=raw_pool['pool'],
                        pool_name=raw_pool['pool_name'],
                        pg_num=raw_pool['pg_num'],
                        type=pool_type,
                        erasure_code_profile=raw_pool.get(
                            'erasure_code_profile'),
                        min_size=raw_pool['min_size'],
                        size=raw_pool.get('size', None),
                        quota_enabled=quota_enabled,
                        quota_max_objects=raw_pool['quota_max_objects'],
                        quota_max_bytes=raw_pool['quota_max_bytes'],
                    ).save()
                # Osd out of band deletion handling
                try:
                    osds = NS._int.client.read(
                        "clusters/%s/Osds" % NS.tendrl_context.integration_id)
                    old_osds = []
                    for osd in osds.leaves:
                        old_osds.append(str(osd.key.split("/")[-1]))
                    new_osds = []
                    for raw_osd in sync_object.get('osds', []):
                        new_osds.append(raw_osd['uuid'])
                    delete_osds = set(old_osds) - set(new_osds)
                    for id in delete_osds:
                        NS._int.client.delete(
                            "clusters/%s/Osds/%s" %
                            (NS.tendrl_context.integration_id, id),
                            recursive=True)
                except etcd.EtcdKeyNotFound as ex:
                    Event(
                        ExceptionMessage(priority="debug",
                                         publisher=NS.publisher_id,
                                         payload={
                                             "message":
                                             "key not found in etcd",
                                             "exception": ex
                                         }))
                for raw_osd in sync_object.get('osds', []):
                    Event(
                        Message(priority="info",
                                publisher=NS.publisher_id,
                                payload={
                                    "message":
                                    "Updating OSD %s" % raw_osd['osd']
                                }))
                    osd_host = socket.gethostbyaddr(
                        raw_osd['public_addr'].split(':')[0])[0]
                    NS.ceph.objects.Osd(
                        id=raw_osd['osd'],
                        uuid=raw_osd['uuid'],
                        hostname=osd_host,
                        public_addr=raw_osd['public_addr'],
                        cluster_addr=raw_osd['cluster_addr'],
                        heartbeat_front_addr=raw_osd['heartbeat_front_addr'],
                        heartbeat_back_addr=raw_osd['heartbeat_back_addr'],
                        down_at=raw_osd['down_at'],
                        up_from=raw_osd['up_from'],
                        lost_at=raw_osd['lost_at'],
                        osd_up=raw_osd['up'],
                        osd_in=raw_osd['in'],
                        up_thru=raw_osd['up_thru'],
                        weight=str(raw_osd['weight']),
                        primary_affinity=str(raw_osd['primary_affinity']),
                        state=raw_osd['state'],
                        last_clean_begin=raw_osd['last_clean_begin'],
                        last_clean_end=raw_osd['last_clean_end']).save()
        else:
            Event(
                Message(priority="debug",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "ClusterMonitor.on_sync_object: "
                            "stale object received for %s" % data['type']
                        }))
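
Both the pool and the OSD branches above use the same out-of-band delete pattern: list the ids currently stored in etcd, list the ids reported by the latest sync object, and delete whatever exists only in etcd. A minimal sketch of that pattern, with hypothetical ids and the delete call left commented out:

def delete_stale(client, base_key, old_ids, new_ids):
    # old_ids: ids currently stored under base_key in etcd
    # new_ids: ids reported by the latest osd_map / pool sync
    for stale_id in set(old_ids) - set(new_ids):
        # client.delete("%s/%s" % (base_key, stale_id), recursive=True)
        print("would delete %s/%s" % (base_key, stale_id))

# Old etcd keys {0, 1, 2}, sync now reports {1, 2, 3} -> pool 0 is treated as deleted out of band
delete_stale(None, "clusters/<integration_id>/Pools", [0, 1, 2], [1, 2, 3])
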
Example No. 11
    def run(self):
        try:
            # Lock nodes
            flow_utils.acquire_node_lock(self.parameters)
            integration_id = self.parameters['TendrlContext.integration_id']
            if integration_id is None:
                raise FlowExecutionFailedError(
                    "TendrlContext.integration_id cannot be empty")

            supported_sds = NS.compiled_definitions.get_parsed_defs(
            )['namespace.tendrl']['supported_sds']
            sds_name = self.parameters["TendrlContext.sds_name"]
            if sds_name not in supported_sds:
                raise FlowExecutionFailedError("SDS (%s) not supported" %
                                               sds_name)

            ssh_job_ids = []
            ssh_job_ids = \
                flow_utils.gluster_create_ssh_setup_jobs(
                    self.parameters,
                    skip_current_node=True
                )

            while True:
                time.sleep(3)
                all_status = {}
                for job_id in ssh_job_ids:
                    job = NS.tendrl.objects.Job(job_id=job_id).load()
                    all_status[job_id] = job.status

                _failed = {
                    _jid: status
                    for _jid, status in all_status.iteritems()
                    if status == "failed"
                }
                if _failed:
                    raise FlowExecutionFailedError(
                        "SSH setup failed for jobs %s cluster %s" %
                        (str(_failed), integration_id))
                if all(
                    [status == "finished" for status in all_status.values()]):
                    logger.log("info",
                               NS.publisher_id, {
                                   "message":
                                   "SSH setup completed for all "
                                   "nodes in cluster %s" % integration_id
                               },
                               job_id=self.parameters['job_id'],
                               flow_id=self.parameters['flow_id'])

                    break

            # SSH setup jobs finished above, now install sds
            # bits and create cluster
            logger.log("info",
                       NS.publisher_id, {
                           "message":
                           "Expanding Gluster Storage"
                           " Cluster %s" % integration_id
                       },
                       job_id=self.parameters['job_id'],
                       flow_id=self.parameters['flow_id'])
            gluster_help.expand_gluster(self.parameters)
            logger.log(
                "info",
                NS.publisher_id, {
                    "message":
                    "SDS install/config completed on newly "
                    "expanded nodes, Please wait while "
                    "tendrl-node-agents detect sds details on the newly "
                    "expanded nodes %s" % self.parameters['Node[]']
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'])

            # Wait till detected cluster is populated for the nodes
            while True:
                time.sleep(3)
                all_status = []
                detected_cluster = ""
                different_cluster_id = False
                dc = ""
                for node in self.parameters['Node[]']:
                    try:
                        dc = NS.tendrl.objects.DetectedCluster(
                            node_id=node).load()
                        if not detected_cluster:
                            detected_cluster = dc.detected_cluster_id
                        else:
                            if detected_cluster != dc.detected_cluster_id:
                                all_status.append(False)
                                different_cluster_id = True
                                break
                        all_status.append(True)
                    except etcd.EtcdKeyNotFound:
                        all_status.append(False)
                if different_cluster_id:
                    raise FlowExecutionFailedError(
                        "Seeing different detected cluster id in"
                        " different nodes. %s and %s" %
                        (detected_cluster, dc.detected_cluster_id))

                if all_status:
                    if all(all_status):
                        break

            # Create the params list for import cluster flow
            new_params = dict()
            new_params['Node[]'] = self.parameters['Node[]']
            new_params['TendrlContext.integration_id'] = integration_id

            # Get node context for one of the nodes from list
            dc = NS.tendrl.objects.DetectedCluster(
                node_id=self.parameters['Node[]'][0]).load()
            sds_pkg_name = dc.sds_pkg_name
            new_params['import_after_expand'] = True
            sds_pkg_version = dc.sds_pkg_version
            new_params['DetectedCluster.sds_pkg_name'] = \
                sds_pkg_name
            new_params['DetectedCluster.sds_pkg_version'] = \
                sds_pkg_version

            tags = []
            for node in self.parameters['Node[]']:
                tags.append("tendrl/node_%s" % node)
            payload = {
                "tags": tags,
                "run": "tendrl.flows.ImportCluster",
                "status": "new",
                "parameters": new_params,
                "parent": self.parameters['job_id'],
                "type": "node"
            }
            _job_id = str(uuid.uuid4())
            # release lock before import cluster
            flow_utils.release_node_lock(self.parameters)

            NS.tendrl.objects.Job(job_id=_job_id,
                                  status="new",
                                  payload=payload).save()
            logger.log(
                "info",
                NS.publisher_id, {
                    "message":
                    "Please wait while Tendrl imports ("
                    "job_id: %s) newly expanded "
                    "%s storage nodes in cluster %s" %
                    (_job_id, sds_pkg_name,
                     NS.tendrl.objects.Cluster(
                         integration_id=integration_id).load().short_name)
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'])
        except Exception as ex:
            Event(
                ExceptionMessage(priority="error",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message": ex.message,
                                     "exception": ex
                                 }))
            # raising exception to mark job as failed
            raise ex
        finally:
            # always release node lock (also when an exception occurred)
            flow_utils.release_node_lock(self.parameters)
Example No. 12
    def _sync_rbds(self):
        try:
            pools = NS._int.client.read("clusters/%s/Pools" %
                                        NS.tendrl_context.integration_id,
                                        recursive=True)
            for child in pools._children:
                pool_id = child['key'].split('/')[-1]
                pool_name = NS._int.client.read(
                    "clusters/%s/Pools/%s/pool_name" %
                    (NS.tendrl_context.integration_id, pool_id)).value
                rbd_details = self._get_rbds(pool_name)
                # Rbd out of band delete handling
                try:
                    rbds = NS._int.client.read(
                        "clusters/%s/Pools/%s/Rbds" %
                        (NS.tendrl_context.integration_id, pool_id))
                    old_rbds = []
                    for rbd in rbds.leaves:
                        old_rbds.append(rbd.key.split("/")[-1])
                    new_rbds = []
                    for k, v in rbd_details.iteritems():
                        new_rbds.append(k)
                    delete_rbds = set(old_rbds) - set(new_rbds)
                    for id in delete_rbds:
                        NS._int.client.delete(
                            "clusters/%s/Pools/%s/Rbds/%s" %
                            (NS.tendrl_context.integration_id, pool_id, id),
                            recursive=True)
                except etcd.EtcdKeyNotFound as ex:
                    Event(
                        ExceptionMessage(
                            priority="debug",
                            publisher=NS.publisher_id,
                            payload={
                                "message":
                                "No rbds found for ceph cluster %s" %
                                NS.tendrl_context.integration_id,
                                "exception":
                                ex
                            }))
                for k, v in rbd_details.iteritems():
                    NS.ceph.objects.Rbd(
                        name=k,
                        size=v['size'],
                        pool_id=pool_id,
                        flags=v['flags'],
                        provisioned=self._to_bytes(v['provisioned'])
                        if v.get("provisioned") else None,
                        used=self._to_bytes(v['used'])).save()
                try:
                    rbds = NS._int.client.read(
                        "clusters/%s/Pools/%s/Rbds" %
                        (NS.tendrl_context.integration_id, pool_id))
                except etcd.EtcdKeyNotFound:
                    # no rbds for pool, continue
                    continue

                for entry in rbds.leaves:
                    fetched_rbd = NS.ceph.objects.Rbd(
                        pool_id=pool_id,
                        name=entry.key.split("Rbds/")[-1]).load()
                    if fetched_rbd.name not in rbd_details.keys():
                        NS._int.client.delete(
                            "clusters/%s/Pools/%s/Rbds/%s" %
                            (NS.tendrl_context.integration_id, pool_id,
                             fetched_rbd.name),
                            recursive=True)
        except etcd.EtcdKeyNotFound:
            pass
Example No. 13
    def _sync_ec_profiles(self):
        """Invokes the below CLI commands

        1.
        ```ceph osd erasure-code-profile ls```

        and the required output format is a list of ec profiles separated by
        new lines, as below

        ```
           default
           k4m2
        ```
        2.
        ```ceph osd erasure-code-profile get {name}```

        and the required output format is '=' separated values in multiple
        lines

        ```
           k=2
           m=1
           plugin=jerasure
           directory={dir}
        ```

        """
        required_ec_profiles = [(2, 1), (4, 2), (6, 3), (8, 4)]
        ec_profile_details = {}

        commands = ['osd', 'erasure-code-profile', 'ls']
        cmd_out = ceph.ceph_command(NS.tendrl_context.cluster_name, commands)
        if cmd_out['err'] == "":
            ec_profile_list = []
            for item in cmd_out['out'].split('\n'):
                if item != "":
                    ec_profile_list.append(item)

            for ec_profile in ec_profile_list:
                commands = ['osd', 'erasure-code-profile', 'get', ec_profile]
                cmd_out = ceph.ceph_command(NS.tendrl_context.cluster_name,
                                            commands)
                if cmd_out['err'] == "":
                    info = {}
                    for item in cmd_out['out'].split('\n'):
                        if item != "":
                            info[item.split('=')[0]] = \
                                item.split('=')[1].strip()
                            ec_profile_details[ec_profile] = info
        # Ec profile out of band delete handling
            try:
                ec_profiles = NS._int.client.read(
                    "clusters/%s/ECProfiles" %
                    (NS.tendrl_context.integration_id))
                old_ec_profiles = []
                for ec_profile in ec_profiles.leaves:
                    old_ec_profiles.append(ec_profile.key.split("/")[-1])
                new_ec_profiles = []
                for k, v in ec_profile_details.iteritems():
                    new_ec_profiles.append(k)
                delete_ec_profiles = set(old_ec_profiles) - set(
                    new_ec_profiles)
                for id in delete_ec_profiles:
                    NS._int.client.delete(
                        "clusters/%s/ECProfiles/%s" %
                        (NS.tendrl_context.integration_id, id),
                        recursive=True)
            except etcd.EtcdKeyNotFound as ex:
                Event(
                    ExceptionMessage(priority="debug",
                                     publisher=NS.publisher_id,
                                     payload={
                                         "message": "key not found in etcd",
                                         "exception": ex
                                     }))
        available_ec_profiles = []
        for k, v in ec_profile_details.iteritems():
            NS.ceph.objects.ECProfile(
                name=k,
                k=v['k'],
                m=v['m'],
                plugin=v.get('plugin'),
                directory=v.get('directory'),
                ruleset_failure_domain=v.get('ruleset_failure_domain')).save()
            available_ec_profiles.append((int(v['k']), int(v['m'])))

        # Create the missing ec_profile_details
        missing_ec_profiles = [
            item for item in required_ec_profiles
            if item not in available_ec_profiles
        ]
        for item in missing_ec_profiles:
            attrs = dict(name="k%sm%s" % (item[0], item[1]),
                         k=item[0],
                         m=item[1],
                         plugin='jerasure',
                         directory='/usr/lib/ceph/erasure-code')
            crud = Crud()
            crud.create("ec_profile", attrs)
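
As a small illustration of the parsing described in the docstring above, the '='-separated output of `ceph osd erasure-code-profile get {name}` maps directly to a dict; the sample output below is assumed.

# Assumed output of `ceph osd erasure-code-profile get k4m2`
cmd_out = "k=4\nm=2\nplugin=jerasure\ndirectory=/usr/lib/ceph/erasure-code"

info = {}
for item in cmd_out.split('\n'):
    if item != "":
        key, value = item.split('=', 1)
        info[key] = value.strip()

print(info)  # {'k': '4', 'm': '2', 'plugin': 'jerasure', 'directory': '/usr/lib/ceph/erasure-code'}
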
Example No. 14
def sync():
    try:
        _keep_alive_for = int(NS.config.data.get("sync_interval", 10)) + 250
        interfaces = get_node_network()
        if len(interfaces) > 0:
            for interface in interfaces:
                NS.tendrl.objects.NodeNetwork(**interface).save(
                    ttl=_keep_alive_for)
                if interface['ipv4']:
                    for ipv4 in interface['ipv4']:
                        index_key = "/indexes/ip/%s" % ipv4
                        try:
                            NS._int.wclient.write(index_key,
                                                  NS.node_context.node_id,
                                                  prevExist=False)
                        except etcd.EtcdAlreadyExist:
                            pass
                            # TODO(team) add ipv6 support
                            # if interface['ipv6']:
                            #    for ipv6 in interface['ipv6']:
                            #        index_key = "/indexes/ip/%s/%s" % (ipv6,
                            #
                            # NS.node_context.node_id)
                            #        NS._int.wclient.write(index_key, 1)

        # global network
        if len(interfaces) > 0:
            for interface in interfaces:
                if interface["subnet"] is not "":
                    NS.node_agent.objects.GlobalNetwork(**interface).save(
                        ttl=_keep_alive_for)
        try:
            networks = NS._int.client.read("/networks")
            for network in networks.leaves:
                try:
                    # delete this node's dir under the subnet when it holds no
                    # network detail; if any entry is present the delete never happens
                    NS._int.wclient.delete(
                        "%s/%s" % (network.key, NS.node_context.node_id),
                        dir=True)
                    # delete the subnet dir itself when it is empty;
                    # if any entry is present the delete never happens
                    NS._int.wclient.delete(network.key, dir=True)
                except (etcd.EtcdKeyNotFound, etcd.EtcdDirNotEmpty):
                    continue
        except etcd.EtcdKeyNotFound as ex:
            Event(
                ExceptionMessage(priority="debug",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message": "Given key is not present in "
                                     "etcd .",
                                     "exception": ex
                                 }))
    except Exception as ex:
        _msg = "node_sync networks sync failed: " + ex.message
        Event(
            ExceptionMessage(priority="error",
                             publisher=NS.publisher_id,
                             payload={
                                 "message": _msg,
                                 "exception": ex
                             }))
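
The `/indexes/ip/<ipv4>` write above relies on etcd's create-only semantics: with `prevExist=False` the write fails if the key already exists, so only the first node to claim an IP gets indexed. A minimal sketch of that pattern with the python-etcd client; host, key and value below are placeholders.

import etcd

client = etcd.Client(host="127.0.0.1", port=2379)
try:
    # Create-only write: raises EtcdAlreadyExist if the index entry already exists
    client.write("/indexes/ip/192.0.2.10", "node-uuid-1234", prevExist=False)
except etcd.EtcdAlreadyExist:
    pass  # another node already owns this index entry
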
Example No. 15
 def get_node_brick_status_counts(self, node_id):
     brick_status_wise_counts = {
         'stopped': 0,
         'total': 0,
         pm_consts.WARNING_ALERTS: 0,
         pm_consts.CRITICAL_ALERTS: 0
     }
     try:
         node_name = central_store_util.get_node_name_from_id(node_id)
     except EtcdKeyNotFound as ex:
         Event(
             ExceptionMessage(
                 priority="error",
                 publisher=NS.publisher_id,
                 payload={
                     "message": "Error fetching node name for node "
                     "%s" % node_id,
                     "exception": ex
                 }
             )
         )
         return brick_status_wise_counts
     try:
         ip_indexes = etcd_read_key('/indexes/ip')
     except EtcdKeyNotFound as ex:
         Event(
             ExceptionMessage(
                 priority="error",
                 publisher=NS.publisher_id,
                 payload={
                     "message": "Error fetching ip indexes",
                     "exception": ex
                 }
             )
         )
         return brick_status_wise_counts
     node_ip = ''
     for ip, indexed_node_id in ip_indexes.iteritems():
         if node_id == indexed_node_id:
             node_ip = ip
             break
     try:
         cluster_id = central_store_util.get_node_cluster_id(
             node_id
         )
         if cluster_id:
             bricks = self.get_cluster_bricks(cluster_id)
             for brick_path, brick_det in bricks.iteritems():
                 if (
                     brick_det['hostname'] == node_name or
                     brick_det['hostname'] == node_ip
                 ):
                     if (
                         'status' in brick_det and
                         brick_det['status'] == 'Stopped'
                     ):
                         brick_status_wise_counts['stopped'] = \
                             brick_status_wise_counts['stopped'] + 1
                     brick_status_wise_counts['total'] = \
                         brick_status_wise_counts['total'] + 1
         crit_alerts, warn_alerts = parse_resource_alerts(
             'brick',
             pm_consts.CLUSTER,
             cluster_id=cluster_id
         )
         count = 0
         for alert in crit_alerts:
             if alert['node_id'] == node_id:
                 count = count + 1
         brick_status_wise_counts[
             pm_consts.CRITICAL_ALERTS
         ] = count
         count = 0
         for alert in warn_alerts:
             if alert['node_id'] == node_id:
                 count = count + 1
         brick_status_wise_counts[
             pm_consts.WARNING_ALERTS
         ] = count
     except (
         TendrlPerformanceMonitoringException,
         AttributeError,
         ValueError,
         KeyError
     ) as ex:
         Event(
             Message(
                 priority="info",
                 publisher=NS.publisher_id,
                 payload={
                     "message": "Exception caught fetching node brick"
                     " status wise counts",
                     "exception": ex
                 }
             )
         )
     return brick_status_wise_counts
Example No. 16
    def run(self):
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "%s running" % self.__class__.__name__}
        )

        gluster_brick_dir = NS.gluster.objects.GlusterBrickDir()
        gluster_brick_dir.save()

        cluster = NS.tendrl.objects.Cluster(
            integration_id=NS.tendrl_context.integration_id
        ).load()
        if cluster.cluster_network in [None, ""]:
            try:
                node_networks = NS.tendrl.objects.NodeNetwork().load_all()
                cluster.cluster_network = node_networks[0].subnet
                cluster.save()
            except etcd.EtcdKeyNotFound as ex:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": "Failed to sync cluster network details"}
                )
        _sleep = 0
        while not self._complete.is_set():
            # To detect out of band deletes
            # refresh gluster object inventory at config['sync_interval']
            SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 100
            NS.node_context = NS.node_context.load()
            NS.tendrl_context = NS.tendrl_context.load()
            if _sleep > 5:
                _sleep = int(NS.config.data.get("sync_interval", 10))
            else:
                _sleep += 1

            try:
                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id
                ).load()
                if (_cluster.status == "importing" and
                    _cluster.current_job['status'] == 'failed') or \
                    _cluster.status == "unmanaging" or \
                    _cluster.status == "set_volume_profiling":
                    continue

                _cnc = NS.tendrl.objects.ClusterNodeContext(
                    node_id=NS.node_context.node_id
                ).load()
                _cnc.is_managed = "yes"
                _cnc.save()
                subprocess.call(
                    [
                        'gluster',
                        'get-state',
                        'glusterd',
                        'odir',
                        '/var/run',
                        'file',
                        'glusterd-state',
                        'detail'
                    ]
                )
                raw_data = ini2json.ini_to_dict(
                    '/var/run/glusterd-state'
                )
                subprocess.call(['rm', '-rf', '/var/run/glusterd-state'])
                subprocess.call(
                    [
                        'gluster',
                        'get-state',
                        'glusterd',
                        'odir',
                        '/var/run',
                        'file',
                        'glusterd-state-vol-opts',
                        'volumeoptions'
                    ]
                )
                raw_data_options = ini2json.ini_to_dict(
                    '/var/run/glusterd-state-vol-opts'
                )
                subprocess.call(
                    [
                        'rm',
                        '-rf',
                        '/var/run/glusterd-state-vol-opts'
                    ]
                )
                sync_object = NS.gluster.objects.\
                    SyncObject(data=json.dumps(raw_data))
                sync_object.save()

                if "Peers" in raw_data:
                    index = 1
                    peers = raw_data["Peers"]
                    disconnected_hosts = []
                    while True:
                        try:
                            peer = NS.tendrl.\
                                objects.GlusterPeer(
                                    peer_uuid=peers['peer%s.uuid' % index],
                                    hostname=peers[
                                        'peer%s.primary_hostname' % index
                                    ],
                                    state=peers['peer%s.state' % index],
                                    connected=peers['peer%s.connected' % index]
                                )
                            try:
                                stored_peer_status = None
                                # find peer detail using hostname
                                ip = socket.gethostbyname(
                                    peers['peer%s.primary_hostname' % index]
                                )
                                node_id = etcd_utils.read(
                                    "/indexes/ip/%s" % ip
                                ).value
                                stored_peer = NS.tendrl.objects.GlusterPeer(
                                    peer_uuid=peers['peer%s.uuid' % index],
                                    node_id=node_id
                                ).load()
                                stored_peer_status = stored_peer.connected
                                current_status = peers[
                                    'peer%s.connected' % index
                                ]
                                if stored_peer_status and \
                                    current_status != stored_peer_status:
                                    msg = (
                                        "Peer %s in cluster %s "
                                        "is %s"
                                    ) % (
                                        peers[
                                            'peer%s.primary_hostname' %
                                            index
                                        ],
                                        _cluster.short_name,
                                        current_status
                                    )
                                    instance = "peer_%s" % peers[
                                        'peer%s.primary_hostname' % index
                                    ]
                                    event_utils.emit_event(
                                        "peer_status",
                                        current_status,
                                        msg,
                                        instance,
                                        'WARNING' if current_status !=
                                        'Connected'
                                        else 'INFO'
                                    )
                                    # save current status in actual peer
                                    # directory also
                                    stored_peer.connected = current_status
                                    stored_peer.save()
                                    # Disconnected host name to
                                    # raise brick alert
                                    if current_status.lower() == \
                                        "disconnected":
                                        disconnected_hosts.append(
                                            peers[
                                                'peer%s.primary_hostname' %
                                                index
                                            ]
                                        )
                            except etcd.EtcdKeyNotFound:
                                pass
                            SYNC_TTL += 5
                            peer.save(ttl=SYNC_TTL)
                            index += 1
                        except KeyError:
                            break
                    # Raise an alert for bricks when peer disconnected
                    # or node goes down
                    for disconnected_host in disconnected_hosts:
                        brick_status_alert(
                            disconnected_host
                        )
                if "Volumes" in raw_data:
                    index = 1
                    volumes = raw_data['Volumes']
                    # instantiating blivet class, this will be used for
                    # getting brick_device_details
                    b = blivet.Blivet()

                    # reset blivet during every sync to get latest information
                    # about storage devices in the machine
                    b.reset()
                    devicetree = b.devicetree
                    total_brick_count = 0
                    while True:
                        try:
                            b_count = sync_volumes(
                                volumes, index,
                                raw_data_options.get('Volume Options'),
                                SYNC_TTL + VOLUME_TTL,
                                _cluster.short_name,
                                devicetree
                            )
                            index += 1
                            SYNC_TTL += 1
                            total_brick_count += b_count - 1
                        except KeyError:
                            global VOLUME_TTL
                            # from the second sync onwards the volume ttl is
                            # SYNC_TTL + (no. of volumes) * 20 +
                            # (no. of bricks) * 10 + 160
                            if index > 1:
                                volume_count = index - 1
                                # When all nodes are down, all volumes are
                                # marked down; node status TTL is 160, so make
                                # sure the volumes are still present in etcd
                                # while raising the volume down alert
                                VOLUME_TTL = (volume_count * 20) + (
                                    total_brick_count * 10) + 160
                            break
                    # populate the volume specific options
                    reg_ex = re.compile("^volume[0-9]+.options+")
                    options = {}
                    for key in volumes.keys():
                        if reg_ex.match(key):
                            options[key] = volumes[key]
                    for key in options.keys():
                        volname = key.split('.')[0]
                        vol_id = volumes['%s.id' % volname]
                        dict1 = {}
                        for k, v in options.items():
                            if k.startswith('%s.options' % volname):
                                dict1['.'.join(k.split(".")[2:])] = v
                                options.pop(k, None)
                        volume = NS.tendrl.objects.GlusterVolume(
                            NS.tendrl_context.integration_id,
                            vol_id=vol_id
                        ).load()
                        if volume.options is not None:
                            dest = dict(volume.options)
                            dest.update(dict1)
                            volume.options = dest
                            volume.save()

                # Sync cluster global details
                if "provisioner/%s" % NS.tendrl_context.integration_id \
                    in NS.node_context.tags:
                    all_volumes = NS.tendrl.objects.GlusterVolume(
                        NS.tendrl_context.integration_id
                    ).load_all() or []
                    volumes = []
                    for volume in all_volumes:
                        if not str(volume.deleted).lower() == "true" and \
                            volume.current_job.get('status', '') \
                            in ['', 'finished', 'failed'] and \
                            volume.vol_id not in [None, ''] and \
                            volume.name not in [None, '']:
                            # refresh volume TTL only for the first sync;
                            # it increases the TTL based on the no. of volumes
                            if _cnc.first_sync_done in [None, "no", ""]:
                                etcd_utils.refresh(
                                    volume.value,
                                    SYNC_TTL + VOLUME_TTL
                                )
                            volumes.append(volume)
                    cluster_status.sync_cluster_status(
                        volumes, SYNC_TTL + VOLUME_TTL
                    )
                    utilization.sync_utilization_details(volumes)
                    client_connections.sync_volume_connections(volumes)
                    georep_details.aggregate_session_status()
                    try:
                        evt.process_events()
                    except etcd.EtcdKeyNotFound:
                        pass
                    rebalance_status.sync_volume_rebalance_status(volumes)
                    rebalance_status.sync_volume_rebalance_estimated_time(
                        volumes
                    )
                    snapshots.sync_volume_snapshots(
                        raw_data['Volumes'],
                        int(NS.config.data.get(
                            "sync_interval", 10
                        )) + len(volumes) * 4
                    )
                    # update alert count
                    update_cluster_alert_count()
                # check and enable volume profiling
                if "provisioner/%s" % NS.tendrl_context.integration_id in \
                    NS.node_context.tags:
                    self._enable_disable_volume_profiling()

                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id
                ).load()
                if _cluster.exists():
                    _cluster = _cluster.load()
                    _cluster.last_sync = str(tendrl_now())
                    # Mark the first sync done flag
                    _cnc = NS.tendrl.objects.ClusterNodeContext(
                        node_id=NS.node_context.node_id
                    ).load()
                    if _cnc.first_sync_done in [None, "no"]:
                        _cnc.first_sync_done = "yes"
                        _cnc.save()
                    if _cluster.current_job.get(
                        'status', ''
                    ) in ['', 'finished', 'failed'] and \
                        _cluster.status in [None, ""]:
                        _cluster.save()
            except Exception as ex:
                Event(
                    ExceptionMessage(
                        priority="error",
                        publisher=NS.publisher_id,
                        payload={"message": "gluster sds state sync error",
                                 "exception": ex
                                 }
                    )
                )
            try:
                etcd_utils.read(
                    '/clusters/%s/_sync_now' %
                    NS.tendrl_context.integration_id
                )
                continue
            except etcd.EtcdKeyNotFound:
                pass

            time.sleep(_sleep)

        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "%s complete" % self.__class__.__name__}
        )
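
The peer and volume loops in the sync above walk the flat dict that ini2json produces, where entries are keyed as `peer1.uuid`, `peer2.uuid`, ... and iteration stops on the first missing index. A minimal sketch of that pattern over a hand-written dict:

# Hand-written stand-in for raw_data["Peers"] as produced by ini2json
peers = {
    "peer1.uuid": "aaaa", "peer1.primary_hostname": "node1", "peer1.connected": "Connected",
    "peer2.uuid": "bbbb", "peer2.primary_hostname": "node2", "peer2.connected": "Disconnected",
}

index = 1
while True:
    try:
        print("%s %s" % (peers["peer%s.uuid" % index], peers["peer%s.connected" % index]))
        index += 1
    except KeyError:
        break  # no more peers
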
Example No. 17
 def get_node_osd_status_wise_counts(self, node_id):
     osds_in_node = []
     osd_status_wise_counts = {
         'total': 0,
         'down': 0,
         pm_consts.CRITICAL_ALERTS: 0,
         pm_consts.WARNING_ALERTS: 0
     }
     cluster_id = central_store_util.get_node_cluster_id(
         node_id
     )
     node_ip = ''
     ip_indexes = etcd_read_key('/indexes/ip')
     for ip, indexed_node_id in ip_indexes.iteritems():
         if node_id == indexed_node_id:
             node_ip = ip
     try:
         osds = etcd_read_key(
             '/clusters/%s/maps/osd_map/data/osds' % cluster_id
         )
         osds = ast.literal_eval(osds.get('osds', '[]'))
         for osd in osds:
             if (
                 node_ip in osd.get('cluster_addr', '') or
                 node_ip in osd.get('public_addr', '')
             ):
                 osds_in_node.append(osd.get('osd'))
                 if 'up' not in osd.get('state'):
                     osd_status_wise_counts['down'] = \
                         osd_status_wise_counts['down'] + 1
                 osd_status_wise_counts['total'] = \
                     osd_status_wise_counts['total'] + 1
         crit_alerts, warn_alerts = parse_resource_alerts(
             'osd',
             pm_consts.CLUSTER,
             cluster_id=cluster_id
         )
         count = 0
         for alert in crit_alerts:
             plugin_instance = alert['tags'].get('plugin_instance', '')
             if int(plugin_instance[len('osd_'):]) in osds_in_node:
                 count = count + 1
         osd_status_wise_counts[
             pm_consts.CRITICAL_ALERTS
         ] = count
         count = 0
         for alert in warn_alerts:
             plugin_instance = alert['tags'].get('plugin_instance', '')
             if int(plugin_instance[len('osd_'):]) in osds_in_node:
                 count = count + 1
         osd_status_wise_counts[
             pm_consts.WARNING_ALERTS
         ] = count
     except (
         EtcdException,
         AttributeError,
         KeyError,
         ValueError,
         TendrlPerformanceMonitoringException
     ) as ex:
         Event(
             ExceptionMessage(
                 priority="debug",
                 publisher=NS.publisher_id,
                 payload={
                     "message": "Exception caught computing node osd "
                     "counts",
                     "exception": ex
                 }
             )
         )
     return osd_status_wise_counts
Example No. 18
    def dispatch_notification(self, alert):
        server = None
        try:
            self.set_destinations()
            if (not self.user_configs or len(self.user_configs) == 0):
                log(
                    "error", "notifier", {
                        "message":
                        'No destinations configured to send '
                        'alert notification'
                    })
                return
        except (AttributeError, EtcdException, ValueError, KeyError,
                SyntaxError) as ex:
            Event(
                ExceptionMessage(priority="debug",
                                 publisher="notifier",
                                 payload={
                                     "message":
                                     'Exception caught attempting to set'
                                     ' %s email destinations' %
                                     str(alert.tags),
                                     "exception":
                                     ex
                                 }))
            return
        try:
            msg = self.format_message(alert)
            if not self.admin_config:
                log(
                    "debug", "notifier", {
                        "message":
                        'Detected alert %s. '
                        'But admin config is a must to send'
                        ' notification' % msg
                    })
                return
            server = self.get_mail_client()
            server.ehlo()
            if self.admin_config['auth'] != "":
                server.login(self.admin_config['email_id'],
                             self.admin_config['email_pass'])
            server.sendmail(self.admin_config['email_id'], self.user_configs,
                            msg)
            log(
                "debug", "notifier", {
                    "message":
                    'Sent mail to %s to alert about %s' %
                    (self.user_configs, msg)
                })
        except (error, smtplib.SMTPException, smtplib.SMTPAuthenticationError,
                smtplib.socket.gaierror, smtplib.SMTPSenderRefused,
                Exception) as ex:
            Event(
                ExceptionMessage(priority="debug",
                                 publisher="notifier",
                                 payload={
                                     "message":
                                     'Exception caught attempting to email '
                                     '%s' % msg,
                                     "exception":
                                     ex
                                 }))

        finally:
            if server:
                server.close()
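
The mail dispatch above follows the standard smtplib flow: connect, ehlo, optional login, sendmail, and close in a finally block. A minimal standalone sketch of that flow (host, port and credentials are placeholders, not values taken from the notifier config):

import smtplib


def send_alert_mail(host, port, sender, password, recipients, message):
    server = None
    try:
        server = smtplib.SMTP(host, port)
        server.ehlo()
        if password:
            server.login(sender, password)
        server.sendmail(sender, recipients, message)
    finally:
        # always close the connection, even if sending failed
        if server:
            server.close()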
Exemplo n.º 19
0
    def run(self):
        try:
            integration_id = self.parameters['TendrlContext.integration_id']
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=integration_id
            ).load()

            # Lock nodes
            flow_utils.acquire_node_lock(self.parameters)
            NS.tendrl_context = NS.tendrl_context.load()

            # TODO(team) when Tendrl supports create/expand/shrink cluster
            # setup passwordless ssh for all gluster nodes with given
            # integration_id (check
            # /indexes/tags/tendrl/integration/$integration_id for list of
            # nodes in cluster)

            node_list = self.parameters['Node[]']
            cluster_nodes = []
            if len(node_list) > 1:
                # This is the master node for this flow
                for node in node_list:
                    if NS.node_context.node_id != node:
                        new_params = self.parameters.copy()
                        new_params['Node[]'] = [node]
                        # create same flow for each node in node list except
                        #  $this
                        payload = {"tags": ["tendrl/node_%s" % node],
                                   "run": "tendrl.flows.ImportCluster",
                                   "status": "new",
                                   "parameters": new_params,
                                   "parent": self.parameters['job_id'],
                                   "type": "node"
                                   }
                        _job_id = str(uuid.uuid4())
                        cluster_nodes.append(_job_id)
                        NS.tendrl.objects.Job(
                            job_id=_job_id,
                            status="new",
                            payload=payload
                        ).save()
                        logger.log(
                            "info",
                            NS.publisher_id,
                            {"message": "ImportCluster %s (jobID: %s) :"
                                        "importing host %s" %
                             (_cluster.short_name, _job_id, node)},
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id']
                        )
            # Check if minimum required version of underlying gluster
            # cluster met. If not fail the import task
            # A sample output from "rpm -qa | grep glusterfs-server"
            # looks as below
            # `glusterfs-server-3.8.4-54.4.el7rhgs.x86_64`
            # In case of upstream build the format could be as below
            # `glusterfs-server-4.1dev-0.203.gitc3e1a2e.el7.centos.x86_64`
            # `glusterfs-server-3.12.8-0.0.el7.centos.x86_64.rpm`
            cmd = subprocess.Popen(
                'rpm -q glusterfs-server',
                shell=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            out, err = cmd.communicate()
            if out in [None, ""] or err:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": "Failed to detect underlying cluster version"},
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False
            lines = out.split('\n')
            build_no = None
            req_build_no = None
            ver_det = lines[0].split('glusterfs-server-')[-1].split('.')
            maj_ver = ver_det[0]
            min_ver = ver_det[1]
            if 'dev' in min_ver:
                min_ver = min_ver.split('dev')[0]
            rel = ver_det[2]
            if '-' in rel:
                build_no = rel.split('-')[-1]
                rel = rel.split('-')[0]
            reqd_gluster_ver = NS.compiled_definitions.get_parsed_defs()[
                'namespace.tendrl'
            ]['min_reqd_gluster_ver']
            req_maj_ver, req_min_ver, req_rel = reqd_gluster_ver.split('.')
            if '-' in req_rel:
                req_build_no = req_rel.split('-')[-1]
                req_rel = req_rel.split('-')[0]
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Checking minimum required version ("
                            "%s.%s.%s) of Gluster Storage" %
                 (req_maj_ver, req_min_ver, req_rel)},
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            ver_check_failed = False
            if int(maj_ver) < int(req_maj_ver):
                ver_check_failed = True
            else:
                if int(maj_ver) == int(req_maj_ver):
                    if int(min_ver) < int(req_min_ver):
                        ver_check_failed = True
                    else:
                        if int(min_ver) == int(req_min_ver):
                            if int(rel) < int(req_rel):
                                ver_check_failed = True
                            else:
                                if int(rel) == int(req_rel):
                                    if build_no is not None and \
                                        req_build_no is not None and \
                                        int(build_no) < int(req_build_no):
                                        ver_check_failed = True
            if ver_check_failed:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": "Error: Minimum required version "
                                "(%s.%s.%s) "
                     "doesnt match that of detected Gluster "
                                "Storage (%s.%s.%s)" %
                     (req_maj_ver, req_min_ver, req_rel,
                      maj_ver, min_ver, 0)},
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False

            ret_val, err = import_gluster(self.parameters)
            if not ret_val:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": "Error importing the cluster (integration_id:"
                                " %s). Error: %s" % (integration_id, err)
                     },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False

            if len(node_list) > 1:
                logger.log(
                    "info",
                    NS.publisher_id,
                    {"message": "ImportCluster %s waiting for hosts %s "
                        "to be imported" % (_cluster.short_name, node_list)},
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                loop_count = 0
                # Wait for (no of nodes) * 6 minutes for import to complete
                wait_count = (len(node_list) - 1) * 36
                while True:
                    child_jobs_failed = []
                    parent_job = NS.tendrl.objects.Job(
                        job_id=self.parameters['job_id']
                    ).load()
                    if loop_count >= wait_count:
                        logger.log(
                            "error",
                            NS.publisher_id,
                            {"message": "Import jobs on cluster(%s) not yet "
                             "complete on all nodes(%s). Timing out." %
                             (_cluster.short_name, str(node_list))},
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id']
                        )
                        # Marking child jobs as failed which did not complete
                        # as the parent job has timed out. This has to be done
                        # explicitly because these jobs will still be processed
                        # by the node-agent, and will keep it busy, which might
                        # defer the new jobs or lead to their timeout.
                        for child_job_id in parent_job.children:
                            child_job = NS.tendrl.objects.Job(
                                job_id=child_job_id
                            ).load()
                            if child_job.status not in ["finished", "failed"]:
                                child_job.status = "failed"
                                child_job.save()
                        return False
                    time.sleep(10)
                    completed = True
                    for child_job_id in parent_job.children:
                        child_job = NS.tendrl.objects.Job(
                            job_id=child_job_id
                        ).load()
                        if child_job.status not in ["finished", "failed"]:
                            completed = False
                        elif child_job.status == "failed":
                            child_jobs_failed.append(child_job.job_id)
                    if completed:
                        break
                    else:
                        loop_count += 1
                        continue
                if len(child_jobs_failed) > 0:
                    _msg = "Child jobs failed are %s" % child_jobs_failed
                    logger.log(
                        "error",
                        NS.publisher_id,
                        {"message": _msg},
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id']
                    )
                    return False
        except Exception as ex:
            # For traceback
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={
                        "message": ex.message,
                        "exception": ex
                    }
                )
            )
            # raising exception to mark job as failed
            raise ex
        finally:
            # release lock
            flow_utils.release_node_lock(self.parameters)

        return True
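
The nested major/minor/release/build checks above can also be expressed as a single tuple comparison. A minimal standalone sketch, assuming version strings such as "3.8.4-54" or "3.12.8" (these helpers are an illustration, not part of the flow above):

def as_version_tuple(ver):
    # "3.8.4-54" -> (3, 8, 4, 54); non-numeric suffixes like "1dev" -> 1
    parts = []
    for token in ver.replace('-', '.').split('.'):
        digits = ''.join(ch for ch in token if ch.isdigit())
        parts.append(int(digits) if digits else 0)
    return tuple(parts)


def meets_minimum(detected, required):
    d, r = as_version_tuple(detected), as_version_tuple(required)
    width = max(len(d), len(r))
    # pad the shorter tuple with zeros so "3.8.4" compares against "3.8.4-54"
    return d + (0,) * (width - len(d)) >= r + (0,) * (width - len(r))


# meets_minimum("3.8.4-54", "3.8.4")  -> True
# meets_minimum("3.7.9", "3.8.4")     -> False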
Exemplo n.º 20
0
    def run(self):
        try:
            # Lock nodes
            create_cluster_utils.acquire_node_lock(self.parameters)
            integration_id = self.parameters['TendrlContext.integration_id']
            sds_name = self.parameters['DetectedCluster.sds_pkg_name']

            if not self.parameters.get('import_after_expand', False) and \
                not self.parameters.get('import_after_create', False):

                # check if gdeploy is already provisioned in this cluster
                # if not, it has to be provisioned here
                if sds_name.find("gluster") > -1 and \
                    not self.parameters.get("gdeploy_provisioned", False) and \
                    not self._probe_and_mark_provisioner(
                        self.parameters["Node[]"], integration_id
                    ):
                    create_cluster_utils.install_gdeploy()
                    create_cluster_utils.install_python_gdeploy()
                    ssh_job_ids = create_cluster_utils.gluster_create_ssh_setup_jobs(
                        self.parameters)

                    while True:
                        gevent.sleep(3)
                        all_status = {}
                        for job_id in ssh_job_ids:
                            all_status[job_id] = NS._int.client.read(
                                "/queue/%s/status" % job_id).value

                        _failed = {
                            _jid: status
                            for _jid, status in all_status.iteritems()
                            if status == "failed"
                        }
                        if _failed:
                            raise AtomExecutionFailedError(
                                "SSH setup failed for jobs %s cluster %s" %
                                (str(_failed), integration_id))
                        if all([
                                status == "finished"
                                for status in all_status.values()
                        ]):
                            Event(
                                Message(
                                    job_id=self.parameters['job_id'],
                                    flow_id=self.parameters['flow_id'],
                                    priority="info",
                                    publisher=NS.publisher_id,
                                    payload={
                                        "message":
                                        "SSH setup completed for all nodes in cluster %s"
                                        % integration_id
                                    }))
                            # set this node as gluster provisioner
                            tags = ["provisioner/%s" % integration_id]
                            NS.node_context = NS.node_context.load()
                            tags += NS.node_context.tags
                            NS.node_context.tags = list(set(tags))
                            NS.node_context.save()

                            # set gdeploy_provisioned to true so that no other nodes
                            # tries to configure gdeploy
                            self.parameters['gdeploy_provisioned'] = True
                            break

            NS.tendrl_context = NS.tendrl_context.load()
            NS.tendrl_context.integration_id = integration_id
            _detected_cluster = NS.tendrl.objects.DetectedCluster().load()
            NS.tendrl_context.cluster_id = _detected_cluster.detected_cluster_id
            NS.tendrl_context.cluster_name = _detected_cluster.detected_cluster_name
            NS.tendrl_context.sds_name = _detected_cluster.sds_pkg_name
            NS.tendrl_context.sds_version = _detected_cluster.sds_pkg_version
            NS.tendrl_context.save()
            Event(
                Message(job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Registered Node %s with cluster %s" %
                            (NS.node_context.node_id,
                             NS.tendrl_context.integration_id)
                        }))

            node_list = self.parameters['Node[]']
            cluster_nodes = []
            if len(node_list) > 1:
                # This is the master node for this flow
                for node in node_list:
                    if NS.node_context.node_id != node:
                        new_params = self.parameters.copy()
                        new_params['Node[]'] = [node]
                        # create same flow for each node in node list except $this
                        payload = {
                            "tags": ["tendrl/node_%s" % node],
                            "run": "tendrl.flows.ImportCluster",
                            "status": "new",
                            "parameters": new_params,
                            "parent": self.parameters['job_id'],
                            "type": "node"
                        }
                        _job_id = str(uuid.uuid4())
                        cluster_nodes.append(_job_id)
                        Job(job_id=_job_id, status="new",
                            payload=payload).save()
                        Event(
                            Message(
                                job_id=self.parameters['job_id'],
                                flow_id=self.parameters['flow_id'],
                                priority="info",
                                publisher=NS.publisher_id,
                                payload={
                                    "message":
                                    "Importing (job: %s) Node %s to cluster %s"
                                    % (_job_id, node, integration_id)
                                }))

            if "ceph" in sds_name.lower():
                node_context = NS.node_context.load()
                is_mon = False
                for tag in node_context.tags:
                    mon_tag = NS.compiled_definitions.get_parsed_defs(
                    )['namespace.tendrl']['tags']['ceph-mon']
                    if mon_tag in tag:
                        is_mon = True
                if is_mon:
                    # Check if minimum required version of underlying ceph
                    # cluster met. If not fail the import task
                    detected_cluster = NS.tendrl.objects.DetectedCluster(
                    ).load()
                    detected_cluster_ver = detected_cluster.sds_pkg_version.split(
                        '.')
                    maj_ver = detected_cluster_ver[0]
                    min_ver = detected_cluster_ver[1]
                    reqd_ceph_ver = NS.compiled_definitions.get_parsed_defs(
                    )['namespace.tendrl']['min_reqd_ceph_ver']
                    req_maj_ver, req_min_ver, req_rel = reqd_ceph_ver.split(
                        '.')
                    Event(
                        Message(
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id'],
                            priority="info",
                            publisher=NS.publisher_id,
                            payload={
                                "message":
                                "Check: Minimum required version (%s.%s.%s) of Ceph Storage"
                                % (req_maj_ver, req_min_ver, req_rel)
                            }))
                    if (int(maj_ver), int(min_ver)) < \
                            (int(req_maj_ver), int(req_min_ver)):
                        Event(
                            Message(
                                job_id=self.parameters['job_id'],
                                flow_id=self.parameters['flow_id'],
                                priority="error",
                                publisher=NS.publisher_id,
                                payload={
                                    "message":
                                    "Error: Minimum required version (%s.%s.%s) "
                                    "doesnt match that of detected Ceph Storage (%s.%s.%s)"
                                    % (req_maj_ver, req_min_ver, req_rel,
                                       maj_ver, min_ver, 0)
                                }))

                        raise FlowExecutionFailedError(
                            "Detected ceph version: %s"
                            " is lesser than required version: %s" %
                            (detected_cluster.sds_pkg_version, reqd_ceph_ver))
                    import_ceph(self.parameters)
            else:
                # Check if minimum required version of underlying gluster
                # cluster met. If not fail the import task
                detected_cluster = NS.tendrl.objects.DetectedCluster().load()
                detected_cluster_ver = detected_cluster.sds_pkg_version.split(
                    '.')
                maj_ver = detected_cluster_ver[0]
                min_ver = detected_cluster_ver[1]
                reqd_gluster_ver = NS.compiled_definitions.get_parsed_defs(
                )['namespace.tendrl']['min_reqd_gluster_ver']
                req_maj_ver, req_min_ver, req_rel = reqd_gluster_ver.split('.')
                Event(
                    Message(
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Check: Minimum required version (%s.%s.%s) of Gluster Storage"
                            % (req_maj_ver, req_min_ver, req_rel)
                        }))
                if (int(maj_ver), int(min_ver)) < \
                        (int(req_maj_ver), int(req_min_ver)):
                    Event(
                        Message(
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id'],
                            priority="error",
                            publisher=NS.publisher_id,
                            payload={
                                "message":
                                "Error: Minimum required version (%s.%s.%s) "
                                "doesnt match that of detected Gluster Storage (%s.%s.%s)"
                                % (req_maj_ver, req_min_ver, req_rel, maj_ver,
                                   min_ver, 0)
                            }))

                    raise FlowExecutionFailedError(
                        "Detected gluster version: %s"
                        " is lesser than required version: %s" %
                        (detected_cluster.sds_pkg_version, reqd_gluster_ver))
                import_gluster(self.parameters)

            Event(
                Message(job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Waiting for participant nodes %s to be "
                            "imported %s" % (node_list, integration_id)
                        }))

            # An import is successful once all Node[] register to
            # /clusters/:integration_id/nodes/:node_id
            while True:
                _all_node_status = []
                gevent.sleep(3)
                for node_id in self.parameters['Node[]']:
                    _status = NS.tendrl.objects.ClusterNodeContext(node_id=node_id).exists() \
                        and NS.tendrl.objects.ClusterTendrlContext(
                            integration_id=integration_id
                        ).exists()
                    _all_node_status.append(_status)
                if _all_node_status:
                    if all(_all_node_status):
                        Event(
                            Message(
                                job_id=self.parameters['job_id'],
                                flow_id=self.parameters['flow_id'],
                                priority="info",
                                publisher=NS.publisher_id,
                                payload={
                                    "message":
                                    "Import Cluster completed for all nodes "
                                    "in cluster %s" % integration_id
                                }))

                        break

            Event(
                Message(job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                        priority="info",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Sucessfully imported cluster %s" % integration_id
                        }))
        except Exception as ex:
            # For traceback
            Event(
                ExceptionMessage(priority="error",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message": ex.message,
                                     "exception": ex
                                 }))
            # raising exception to mark job as failed
            raise ex
        finally:
            # release lock
            create_cluster_utils.release_node_lock(self.parameters)

        return True
Exemplo n.º 21
0
def sync():
    try:
        # platform plugins
        Event(
            Message(
                priority="debug",
                publisher=NS.publisher_id,
                payload={"message": "Running Platform detection"
                         }
            )
        )
        try:
            p_mgr = platform_manager.PlatformManager()
        except ValueError as ex:
            Event(
                ExceptionMessage(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": 'Failed to init PlatformManager. \Error %s',
                        "exception": ex
                        }
                )
            )
            return
        # execute the platform plugins
        for plugin in p_mgr.get_available_plugins():
            platform_details = plugin.discover_platform()
            if len(platform_details.keys()) > 0:
                # update etcd
                try:
                    NS.platform = NS.tendrl.objects.Platform(
                        os=platform_details["Name"],
                        os_version=platform_details["OSVersion"],
                        kernel_version=platform_details["KernelVersion"],
                    )
                    NS.platform.save()

                except etcd.EtcdException as ex:
                    Event(
                        ExceptionMessage(
                            priority="debug",
                            publisher=NS.publisher_id,
                            payload={
                                "message": "Failed to update etcd . \Error %s",
                                "exception": ex
                            }
                        )
                    )
                break
    except Exception as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={"message": "node_sync "
                                    "OS Platform detection failed: " +
                                    ex.message,
                         "exception": ex}
            )
        )
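
The platform plugins referenced above each expose a discover_platform() returning a dict with "Name", "OSVersion" and "KernelVersion" keys. A minimal standalone sketch of what such a plugin could do on a modern Linux host (the os-release parsing below is an assumption for illustration, not Tendrl's plugin code):

import platform


def discover_platform():
    details = {"KernelVersion": platform.release()}
    try:
        with open('/etc/os-release') as f:
            fields = dict(
                line.rstrip().split('=', 1) for line in f if '=' in line
            )
        details["Name"] = fields.get("NAME", "").strip('"')
        details["OSVersion"] = fields.get("VERSION_ID", "").strip('"')
    except IOError:
        # fall back to empty values if os-release is unavailable
        details.setdefault("Name", "")
        details.setdefault("OSVersion", "")
    return details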
Exemplo n.º 22
0
 def calculate_host_summary(self, node):
     gevent.sleep(0.1)
     cpu_usage = self.get_net_host_cpu_utilization(node)
     memory_usage = self.get_net_host_memory_utilization(node)
     storage_usage = self.get_net_storage_utilization(node)
     swap_usage = self.get_net_host_swap_utilization(node)
     alert_count = self.get_alert_count(node)
     sds_det = NS.sds_monitoring_manager.get_node_summary(node)
     old_summary = NodeSummary(node_id=node,
                               name='',
                               status='',
                               role='',
                               cluster_name='',
                               cpu_usage={
                                   'percent_used': '',
                                   'updated_at': ''
                               },
                               memory_usage={
                                   'percent_used': '',
                                   'updated_at': '',
                                   'used': '',
                                   'total': ''
                               },
                               storage_usage={
                                   'percent_used': '',
                                   'total': '',
                                   'used': '',
                                   'updated_at': ''
                               },
                               swap_usage={
                                   'percent_used': '',
                                   'updated_at': '',
                                   'used': '',
                                   'total': ''
                               },
                               sds_det={},
                               alert_count=alert_count)
     try:
         old_summary = old_summary.load()
     except EtcdKeyNotFound:
         pass
     except (EtcdConnectionFailed, Exception) as ex:
         Event(
             ExceptionMessage(priority="debug",
                              publisher=NS.publisher_id,
                              payload={
                                  "message":
                                  'Failed to fetch previously computed '
                                  'summary from etcd.',
                                  "exception":
                                  ex
                              }))
         return
     if cpu_usage is None:
         cpu_usage = old_summary.cpu_usage
     if memory_usage is None:
         memory_usage = old_summary.memory_usage
     if storage_usage is None:
         storage_usage = old_summary.storage_usage
     if swap_usage is None:
         swap_usage = old_summary.swap_usage
     try:
         summary = NodeSummary(
             name=central_store_util.get_node_name_from_id(node),
             node_id=node,
             status=self.get_node_status(node),
             role=central_store_util.get_node_role(node),
             cluster_name=central_store_util.get_node_cluster_name(node),
             cpu_usage=cpu_usage,
             memory_usage=memory_usage,
             storage_usage=storage_usage,
             swap_usage=swap_usage,
             selinux_mode=central_store_util.get_node_selinux_mode(node),
             sds_det=sds_det,
             alert_count=alert_count)
         summary.save(update=False)
     except Exception as ex:
         Event(
             ExceptionMessage(priority="debug",
                              publisher=NS.publisher_id,
                              payload={
                                  "message":
                                  'Exception caught while trying to '
                                  'save summary for node %s' % str(node),
                                  "exception":
                                  ex
                              }))
Exemplo n.º 23
0
    def load(self):
        if "Message" not in self.__class__.__name__:
            try:
                # Generate current in memory object hash
                self.hash = self._hash()
                _hash_key = "/{0}/hash".format(self.value)
                _stored_hash = None
                try:
                    _stored_hash = NS._int.client.read(_hash_key).value
                except (etcd.EtcdConnectionFailed, etcd.EtcdException) as ex:
                    if type(ex) != etcd.EtcdKeyNotFound:
                        NS._int.reconnect()
                        _stored_hash = NS._int.client.read(_hash_key).value
                if self.hash == _stored_hash:
                    # Stored object and in-memory object are identical,
                    # return the current object as-is
                    return self
            except TypeError:
                # no hash for this object, read it from the central store
                pass

        _copy = self._copy_vars()

        for item in _copy.render():
            try:
                Event(
                    Message(priority="debug",
                            publisher=NS.publisher_id,
                            payload={"message": "Reading %s" % item['key']}))
            except KeyError:
                sys.stdout.write("Reading %s" % item['key'])

            try:
                etcd_resp = NS._int.client.read(item['key'], quorum=True)
            except (etcd.EtcdConnectionFailed, etcd.EtcdException) as ex:
                if type(ex) == etcd.EtcdKeyNotFound:
                    continue
                else:
                    NS._int.reconnect()
                    etcd_resp = NS._int.client.read(item['key'], quorum=True)

            value = etcd_resp.value
            if item['dir']:
                key = item['key'].split('/')[-1]
                dct = {key: value}
                if hasattr(_copy, item['name']):
                    existing = getattr(_copy, item['name'])
                    if type(existing) == dict:
                        existing[key] = value
                    else:
                        setattr(_copy, item['name'], dct)
                else:
                    setattr(_copy, item['name'], dct)
                continue

            # convert list, dict (json) to python based on definitions
            _type = self._defs.get("attrs", {}).get(item['name'],
                                                    {}).get("type")
            if _type:
                if _type.lower() in ['json', 'list']:
                    if value:
                        try:
                            value = json.loads(value.decode('utf-8'))
                        except ValueError as ex:
                            _msg = "Error load() attr %s for object %s" % \
                                   (item['name'], self.__name__)
                            Event(
                                ExceptionMessage(priority="debug",
                                                 publisher=NS.publisher_id,
                                                 payload={
                                                     "message": _msg,
                                                     "exception": ex
                                                 }))
                    else:
                        if _type.lower() == "list":
                            value = list()
                        if _type.lower() == "json":
                            value = dict()

            setattr(_copy, item['name'], value)
        return _copy
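
The tail of load() coerces raw etcd strings back into the Python types declared in the object definitions. A minimal standalone sketch of that coercion step (the defs layout mirrors the _defs usage above; the helper name is illustrative):

import json


def coerce_attr(defs, name, raw_value):
    _type = defs.get("attrs", {}).get(name, {}).get("type", "") or ""
    if _type.lower() in ("json", "list"):
        if raw_value:
            return json.loads(raw_value)
        # empty values default to the declared container type
        return [] if _type.lower() == "list" else {}
    return raw_value


# coerce_attr({"attrs": {"tags": {"type": "list"}}}, "tags", '["a", "b"]')
# -> ['a', 'b']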
Exemplo n.º 24
0
def sync_volumes(
    volumes, index,
    vol_options,
    sync_ttl,
    cluster_short_name,
    devicetree
):
    NS.node_context = NS.tendrl.objects.NodeContext().load()
    tag_list = NS.node_context.tags
    # Raise alerts for volume state change.
    cluster_provisioner = "provisioner/%s" % NS.tendrl_context.integration_id
    if cluster_provisioner in tag_list:
        try:
            _volume = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            if _volume.locked_by and 'job_id' in _volume.locked_by and \
                _volume.current_job.get('status', '') == 'in_progress':
                # There is a job active on volume. skip the sync
                return
            stored_volume_status = _volume.status
            current_status = volumes['volume%s.status' % index]
            if stored_volume_status not in [None, ""] and \
                current_status != stored_volume_status:
                msg = ("Status of volume: %s in cluster %s "
                       "changed from %s to %s") % (
                           volumes['volume%s.name' % index],
                           cluster_short_name,
                           stored_volume_status,
                           current_status)
                instance = "volume_%s" % volumes[
                    'volume%s.name' % index
                ]
                event_utils.emit_event(
                    "volume_status",
                    current_status,
                    msg,
                    instance,
                    'WARNING' if current_status == 'Stopped'
                    else 'INFO',
                    tags={"entity_type": RESOURCE_TYPE_VOLUME,
                          "volume_name": volumes['volume%s.name' % index]
                          }
                )
        except (KeyError, etcd.EtcdKeyNotFound) as ex:
            if isinstance(ex, KeyError):
                raise ex
            pass

        volume = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).load()
        volume.vol_type = "arbiter" \
            if int(volumes['volume%s.arbiter_count' % index]) > 0 \
            else volumes['volume%s.type' % index]
        volume.name = volumes['volume%s.name' % index]
        volume.transport_type = volumes['volume%s.transport_type' % index]
        volume.status = volumes['volume%s.status' % index]
        volume.brick_count = volumes['volume%s.brickcount' % index]
        volume.snap_count = volumes['volume%s.snap_count' % index]
        volume.stripe_count = volumes['volume%s.stripe_count' % index]
        volume.replica_count = volumes['volume%s.replica_count' % index]
        volume.subvol_count = volumes['volume%s.subvol_count' % index]
        volume.arbiter_count = volumes['volume%s.arbiter_count' % index]
        volume.disperse_count = volumes['volume%s.disperse_count' % index]
        volume.redundancy_count = volumes['volume%s.redundancy_count' % index]
        volume.quorum_status = volumes['volume%s.quorum_status' % index]
        volume.snapd_status = volumes[
            'volume%s.snapd_svc.online_status' % index]
        volume.snapd_inited = volumes['volume%s.snapd_svc.inited' % index]
        if NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).exists():
            existing_vol = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            volume_profiling_old_value = existing_vol.profiling_enabled
        else:
            volume_profiling_old_value = volume.profiling_enabled
        if ('volume%s.profile_enabled' % index) in volumes:
            value = int(volumes['volume%s.profile_enabled' % index])
            if value == 1:
                volume_profiling_new_value = "yes"
            else:
                volume_profiling_new_value = "no"
        else:
            volume_profiling_new_value = None
        volume.profiling_enabled = volume_profiling_new_value
        if volume_profiling_old_value not in [None, ""] and \
            volume_profiling_old_value != volume_profiling_new_value:
            # Raise alert for the same value change
            msg = ("Value of volume profiling for volume: %s "
                   "of cluster %s changed from %s to %s" % (
                       volumes['volume%s.name' % index],
                       cluster_short_name,
                       volume_profiling_old_value,
                       volume_profiling_new_value))
            instance = "volume_%s" % \
                volumes['volume%s.name' % index]
            event_utils.emit_event(
                "volume_profiling_status",
                volume_profiling_new_value,
                msg,
                instance,
                'INFO',
                tags={
                    "entity_type": RESOURCE_TYPE_BRICK,
                    "volume_name": volumes[
                        'volume%s.name' % index
                    ]
                }
            )
        volume.save(ttl=sync_ttl)
        # Save the default values of volume options
        vol_opt_dict = {}
        for opt_count in \
            range(1, int(vol_options['volume%s.options.count' % index])):
            vol_opt_dict[
                vol_options[
                    'volume%s.options.key%s' % (index, opt_count)
                ]
            ] = vol_options[
                'volume%s.options.value%s' % (index, opt_count)
            ]
        volume.options = vol_opt_dict
        volume.save()

    rebal_det = NS.gluster.objects.RebalanceDetails(
        vol_id=volumes['volume%s.id' % index],
        rebal_id=volumes['volume%s.rebalance.id' % index],
        rebal_status=volumes['volume%s.rebalance.status' % index],
        rebal_failures=volumes['volume%s.rebalance.failures' % index],
        rebal_skipped=volumes['volume%s.rebalance.skipped' % index],
        rebal_lookedup=volumes['volume%s.rebalance.lookedup' % index],
        rebal_files=volumes['volume%s.rebalance.files' % index],
        rebal_data=volumes['volume%s.rebalance.data' % index],
        time_left=volumes.get('volume%s.rebalance.time_left' % index),
    )
    rebal_det.save(ttl=sync_ttl)
    georep_details.save_georep_details(volumes, index)

    b_index = 1
    # ipv4 address of current node
    try:
        network_ip = []
        networks = NS.tendrl.objects.NodeNetwork().load_all()
        for network in networks:
            if network.ipv4:
                network_ip.extend(network.ipv4)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Could not find "
                    "any ipv4 networks for node"
                    " %s" % NS.node_context.node_id,
                    "exception": ex
                }
            )
        )
    while True:
        try:
            # Update brick node wise
            hostname = volumes[
                'volume%s.brick%s.hostname' % (index, b_index)
            ]
            ip = socket.gethostbyname(hostname)
            try:
                node_id = etcd_utils.read("indexes/ip/%s" % ip).value
                fqdn = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).load().fqdn
                cluster_node_ids = etcd_utils.read(
                    "indexes/tags/tendrl/integration/%s" %
                    NS.tendrl_context.integration_id
                ).value
                cluster_node_ids = json.loads(cluster_node_ids)
                if NS.node_context.fqdn != fqdn or \
                        node_id not in cluster_node_ids:
                    b_index += 1
                    continue
            except(TypeError, etcd.EtcdKeyNotFound):
                b_index += 1
                continue
            sub_vol_size = (int(
                volumes['volume%s.brickcount' % index]
            )) / int(
                volumes['volume%s.subvol_count' % index]
            )
            brick_name = NS.node_context.fqdn
            brick_name += ":"
            brick_name += volumes['volume%s.brick%s' '.path' % (
                index,
                b_index
            )].split(":")[-1].replace("/", "_")

            # Raise alerts if the brick path changes
            try:
                stored_brick = NS.tendrl.objects.GlusterBrick(
                    NS.tendrl_context.integration_id,
                    NS.node_context.fqdn,
                    brick_dir=brick_name.split(":_")[-1]
                ).load()
                current_status = volumes.get(
                    'volume%s.brick%s.status' % (index, b_index)
                )
                if stored_brick.status and \
                    current_status != stored_brick.status:
                    msg = ("Brick:%s in volume:%s has %s"
                           ) % (
                               volumes['volume%s.brick%s' '.path' % (
                                   index,
                                   b_index
                               )],
                               volumes['volume%s.' 'name' % index],
                               current_status)
                    instance = "volume_%s|brick_%s" % (
                        volumes['volume%s.name' % index],
                        volumes['volume%s.brick%s.path' % (
                            index,
                            b_index
                        )]
                    )
                    event_utils.emit_event(
                        "brick_status",
                        current_status,
                        msg,
                        instance,
                        'WARNING' if current_status == 'Stopped'
                        else 'INFO',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": volumes[
                                  'volume%s.' 'name' % index]
                              }
                    )

            except etcd.EtcdKeyNotFound:
                pass

            brk_pth = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s"

            vol_brick_path = brk_pth % (
                NS.tendrl_context.integration_id,
                volumes['volume%s.id' % index],
                str((b_index - 1) / sub_vol_size),
                brick_name
            )

            etcd_utils.write(vol_brick_path, "")
            brick = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                NS.node_context.fqdn,
                brick_dir=brick_name.split(":_")[-1]
            ).load()
            brick.integration_id = NS.tendrl_context.integration_id
            brick.fqdn = NS.node_context.fqdn
            brick.brick_dir = brick_name.split(":_")[-1]
            brick.name = brick_name
            brick.vol_id = volumes['volume%s.id' % index]
            brick.sequence_number = b_index
            brick.brick_path = volumes[
                'volume%s.brick%s.path' % (index, b_index)
            ]
            brick.hostname = volumes.get(
                'volume%s.brick%s.hostname' % (index, b_index)
            )
            brick.port = volumes.get(
                'volume%s.brick%s.port' % (index, b_index)
            )
            brick.vol_name = volumes['volume%s.name' % index]
            brick.used = True
            brick.node_id = NS.node_context.node_id
            brick.status = volumes.get(
                'volume%s.brick%s.status' % (index, b_index)
            )
            brick.filesystem_type = volumes.get(
                'volume%s.brick%s.filesystem_type' % (index, b_index)
            )
            brick.mount_opts = volumes.get(
                'volume%s.brick%s.mount_options' % (index, b_index)
            )
            brick.utilization = brick_utilization.brick_utilization(
                volumes['volume%s.brick%s.path' % (index, b_index)]
            )
            brick.client_count = volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            )
            brick.is_arbiter = volumes.get(
                'volume%s.brick%s.is_arbiter' % (index, b_index)
            )
            brick.save(ttl=sync_ttl)
            # sync brick device details
            brick_device_details.\
                update_brick_device_details(
                    brick_name,
                    volumes[
                        'volume%s.brick%s.path' % (
                            index, b_index)
                    ],
                    devicetree,
                    sync_ttl
                )

            # Sync the brick client details
            c_index = 1
            if int(volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            ) or 0) > 0:
                while True:
                    try:
                        NS.gluster.objects.ClientConnection(
                            brick_name=brick_name,
                            fqdn=NS.node_context.fqdn,
                            brick_dir=brick_name.split(":_")[-1],
                            hostname=volumes[
                                'volume%s.brick%s.client%s.hostname' % (
                                    index, b_index, c_index
                                )
                            ],
                            bytesread=volumes[
                                'volume%s.brick%s.client%s.bytesread' % (
                                    index, b_index, c_index
                                )
                            ],
                            byteswrite=volumes[
                                'volume%s.brick%s.client%s.byteswrite' % (
                                    index, b_index, c_index
                                )
                            ],
                            opversion=volumes[
                                'volume%s.brick%s.client%s.opversion' % (
                                    index, b_index, c_index
                                )
                            ]
                        ).save(ttl=sync_ttl)
                    except KeyError:
                        break
                    c_index += 1
            sync_ttl += 4
            b_index += 1
        except KeyError:
            break
    return b_index
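
The brick path written above groups bricks into subvolumes using integer division: sub_vol_size is brickcount / subvol_count, and the 1-based brick index maps to a 0-based subvolume index. A small worked sketch of that arithmetic (values are illustrative):

brickcount = 6
subvol_count = 2
sub_vol_size = brickcount // subvol_count       # 3

for b_index in range(1, brickcount + 1):
    # mirrors str((b_index - 1) / sub_vol_size) in the Python 2 code above
    print("brick %d -> subvolume%d" % (b_index, (b_index - 1) // sub_vol_size))
# brick 1..3 -> subvolume0, brick 4..6 -> subvolume1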
Exemplo n.º 25
0
    def save(self, update=True, ttl=None):
        self.render()
        if "Message" not in self.__class__.__name__:
            try:
                # Generate current in memory object hash
                self.hash = self._hash()
                _hash_key = "/{0}/hash".format(self.value)
                _stored_hash = None
                try:
                    _stored_hash = NS._int.client.read(_hash_key).value
                except (etcd.EtcdConnectionFailed, etcd.EtcdException) as ex:
                    if type(ex) != etcd.EtcdKeyNotFound:
                        NS._int.reconnect()
                        _stored_hash = NS._int.client.read(_hash_key).value
                if self.hash == _stored_hash:
                    # No changes in stored object and current object,
                    # dont save current object to central store
                    if ttl:
                        etcd_utils.refresh(self.value, ttl)
                    return
            except TypeError:
                # no hash for this object, save the current hash as is
                pass

        if update:
            current_obj = self.load()
            for attr, val in vars(self).iteritems():
                if isinstance(val, (types.FunctionType,
                                    types.BuiltinFunctionType,
                                    types.MethodType, types.BuiltinMethodType,
                                    types.UnboundMethodType)) or \
                        attr.startswith("_") or attr in ['value', 'list']:
                    continue

                if val is None and hasattr(current_obj, attr):
                    # if self.attr is None, use attr value from central
                    # store (i.e. current_obj.attr)
                    if getattr(current_obj, attr):
                        setattr(self, attr, getattr(current_obj, attr))

        self.updated_at = str(time_utils.now())
        for item in self.render():
            '''
                Note: Log messages in this file have try-except
                blocks to run in the condition when the node_agent
                has not been started and namespaces are being created.
            '''
            try:
                Event(
                    Message(priority="debug",
                            publisher=NS.publisher_id,
                            payload={
                                "message":
                                "Writing %s to %s" %
                                (item['key'], item['value'])
                            }))
            except KeyError:
                sys.stdout.write("Writing %s to %s" %
                                 (item['key'], item['value']))
            # convert list, dict to json string based on definitions
            _type = self._defs.get("attrs", {}).get(item['name'],
                                                    {}).get("type")
            if _type:
                if _type.lower() in ['json', 'list']:
                    if item['value']:
                        try:
                            item['value'] = json.dumps(item['value'])
                        except ValueError as ex:
                            _msg = "Error save() attr %s for object %s" % \
                                   (item['name'], self.__name__)
                            Event(
                                ExceptionMessage(priority="debug",
                                                 publisher=NS.publisher_id,
                                                 payload={
                                                     "message": _msg,
                                                     "exception": ex
                                                 }))
            try:
                NS._int.wclient.write(item['key'], item['value'], quorum=True)
            except (etcd.EtcdConnectionFailed, etcd.EtcdException):
                NS._int.wreconnect()
                NS._int.wclient.write(item['key'], item['value'], quorum=True)
        if ttl:
            etcd_utils.refresh(self.value, ttl)
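
save() above avoids redundant etcd writes by persisting a content hash next to the object and short-circuiting when the hash is unchanged. A minimal standalone sketch of that pattern against a plain dict standing in for the store (helper names are illustrative):

import hashlib
import json


def save_if_changed(store, key, obj_dict):
    new_hash = hashlib.md5(
        json.dumps(obj_dict, sort_keys=True).encode('utf-8')
    ).hexdigest()
    if store.get(key + '/hash') == new_hash:
        return False                      # unchanged, skip the write
    store[key + '/data'] = json.dumps(obj_dict)
    store[key + '/hash'] = new_hash
    return True


# store = {}
# save_if_changed(store, '/nodes/1', {'status': 'UP'})   -> True
# save_if_changed(store, '/nodes/1', {'status': 'UP'})   -> False (no write)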
Exemplo n.º 26
0
def brick_status_alert(hostname):
    try:
        # fetching brick details of disconnected node
        lock = None
        path = "clusters/%s/Bricks/all/%s" % (
            NS.tendrl_context.integration_id,
            hostname
        )
        lock = etcd.Lock(
            NS._int.client,
            path
        )
        lock.acquire(
            blocking=True,
            lock_ttl=60
        )
        if lock.is_acquired:
            bricks = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                fqdn=hostname
            ).load_all()
            for brick in bricks:
                if brick.status.lower() == BRICK_STARTED:
                    # raise an alert for brick
                    msg = (
                        "Brick:%s in volume:%s has %s") % (
                            brick.brick_path,
                            brick.vol_name,
                            BRICK_STOPPED.title()
                        )
                    instance = "volume_%s|brick_%s" % (
                        brick.vol_name,
                        brick.brick_path,
                    )
                    event_utils.emit_event(
                        "brick_status",
                        BRICK_STOPPED.title(),
                        msg,
                        instance,
                        'WARNING',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": brick.vol_name,
                              "node_id": brick.node_id,
                              "fqdn": brick.hostname
                              }
                    )
                    # Update brick status as stopped
                    brick.status = BRICK_STOPPED.title()
                    brick.save()
                    lock.release()
    except (
        etcd.EtcdException,
        KeyError,
        ValueError,
        AttributeError
    ) as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "Unable to raise an brick status "
                               "alert for host %s" % hostname,
                    "exception": ex
                }
            )
        )
    finally:
        if isinstance(lock, etcd.lock.Lock) and lock.is_acquired:
            lock.release()
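The distributed-lock pattern used above, reduced to a self-contained sketch (the etcd endpoint and lock path are assumptions; python-etcd's etcd.Lock provides acquire, is_acquired and release):

import etcd

client = etcd.Client(host="127.0.0.1", port=2379)  # assumed local etcd
lock = etcd.Lock(client, "clusters/demo/Bricks/all/node1.example.com")
try:
    lock.acquire(blocking=True, lock_ttl=60)
    if lock.is_acquired:
        pass  # load bricks, emit alerts and update their status here
finally:
    if lock.is_acquired:
        lock.release()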
Exemplo n.º 27
0
def process_job(job):
    jid = job.key.split('/')[-1]
    job_status_key = "/queue/%s/status" % jid
    job_lock_key = "/queue/%s/locked_by" % jid
    NS.node_context = NS.node_context.load()
    # Check job not already locked by some agent
    try:
        _locked_by = etcd_utils.read(job_lock_key).value
        if _locked_by:
            return
    except etcd.EtcdKeyNotFound:
        pass

    # Check job not already "finished", or "processing"
    try:
        _status = etcd_utils.read(job_status_key).value
        if _status in ["finished", "processing"]:
            return
    except etcd.EtcdKeyNotFound:
        pass

    try:
        _job_timeout_key = "/queue/%s/timeout" % jid
        _timeout = None
        _timeout = etcd_utils.read(_job_timeout_key).value
        if _timeout:
            _timeout = _timeout.lower()
    except etcd.EtcdKeyNotFound:
        pass

    # tendrl-node-agent tagged as tendrl/monitor will ensure
    # >10 min old "new" jobs are timed out and marked as
    # "failed" (the parent job of these jobs will also be
    # marked as "failed")
    if "tendrl/monitor" in NS.node_context.tags and \
        _timeout == "yes":
        _job_valid_until_key = "/queue/%s/valid_until" % jid
        _valid_until = None
        try:
            _valid_until = etcd_utils.read(
                _job_valid_until_key).value
        except etcd.EtcdKeyNotFound:
            pass

        if _valid_until:
            _now_epoch = (time_utils.now() -
                          datetime.datetime(1970, 1,
                                            1).replace(
                              tzinfo=utc)).total_seconds()
            if int(_now_epoch) >= int(_valid_until):
                # Job has "new" status since 10 minutes,
                # mark status as "failed" and Job.error =
                # "Timed out"
                try:
                    etcd_utils.write(job_status_key,
                                     "failed",
                                     prevValue="new")
                except etcd.EtcdCompareFailed:
                    pass
                else:
                    job = NS.tendrl.objects.Job(job_id=jid).load()
                    _msg = str("Timed-out (>10min as 'new')")
                    job.errors = _msg
                    job.save()
                    if job.payload.get('parent') is None:
                        alert_utils.alert_job_status(
                            "failed",
                            "Job timed out (job_id: %s)" % jid,
                            integration_id=NS.tendrl_context.integration_id or
                            job.payload['parameters'].get(
                                'TendrlContext.integration_id'
                            ),
                            cluster_name=NS.tendrl_context.cluster_name or
                            job.payload['parameters'].get(
                                'TendrlContext.cluster_name'
                            )
                        )
                    return
        else:
            _now_plus_10 = time_utils.now() + datetime.timedelta(minutes=10)
            _epoch_start = datetime.datetime(1970, 1, 1).replace(tzinfo=utc)

            # noinspection PyTypeChecker
            _now_plus_10_epoch = (_now_plus_10 -
                                  _epoch_start).total_seconds()
            etcd_utils.write(_job_valid_until_key,
                             int(_now_plus_10_epoch))

    job = NS.tendrl.objects.Job(job_id=jid).load()
    if job.payload["type"] == NS.type and \
            job.status == "new":
        # Job routing
        # Flows created by tendrl-api use 'tags' from flow
        # definition to target jobs
        _tag_match = False
        if job.payload.get("tags", []):
            for flow_tag in job.payload['tags']:
                if flow_tag in NS.node_context.tags:
                    _tag_match = True

        if not _tag_match:
            _job_tags = ", ".join(job.payload.get("tags", []))
            _msg = "Node (%s)(type: %s)(tags: %s) will not " \
                   "process job-%s (tags: %s)" % \
                   (NS.node_context.node_id, NS.type,
                    NS.node_context.tags, jid,
                    _job_tags)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": _msg}
            )
            return

        job_status_key = "/queue/%s/status" % job.job_id
        job_lock_key = "/queue/%s/locked_by" % job.job_id
        try:
            lock_info = dict(node_id=NS.node_context.node_id,
                             fqdn=NS.node_context.fqdn,
                             tags=NS.node_context.tags,
                             type=NS.type)
            etcd_utils.write(job_status_key, "processing",
                             prevValue="new")
            etcd_utils.write(job_lock_key,
                             json.dumps(lock_info))
        except etcd.EtcdCompareFailed:
            # job is already being processed by some tendrl
            # agent
            return

        the_flow = None
        try:
            current_ns, flow_name, obj_name = \
                _extract_fqdn(job.payload['run'])

            if obj_name:
                runnable_flow = current_ns.ns.get_obj_flow(
                    obj_name, flow_name)
            else:
                runnable_flow = current_ns.ns.get_flow(flow_name)

            the_flow = runnable_flow(parameters=job.payload[
                'parameters'], job_id=job.job_id)
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Processing Job %s" %
                            job.job_id},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Running Flow %s" %
                            job.payload['run']},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id']
            )
            the_flow.run()
            try:
                etcd_utils.write(job_status_key,
                                 "finished",
                                 prevValue="processing")
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'finished', " \
                       "current job status invalid"
                raise FlowExecutionFailedError(_msg)

            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Job (%s):  Finished "
                            "Flow %s" % (
                                job.job_id,
                                job.payload['run'])},
                job_id=job.job_id,
                flow_id=the_flow.parameters['flow_id'],
            )
            if job.payload.get('parent') is None:
                alert_utils.alert_job_status(
                    "finished",
                    "Job finished successfully (job_id: %s)" % job.job_id,
                    integration_id=NS.tendrl_context.integration_id or
                    job.payload['parameters'].get(
                        'TendrlContext.integration_id'
                    ),
                    cluster_name=NS.tendrl_context.cluster_name or
                    job.payload['parameters'].get(
                        'TendrlContext.cluster_name'
                    )
                )
        except (FlowExecutionFailedError,
                AtomExecutionFailedError,
                Exception) as e:
            # format_exc() formats the exception currently being handled
            _trace = traceback.format_exc()
            _msg = "Failure in Job %s Flow %s with error:" % \
                   (job.job_id, job.payload['run'])
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": _msg + _trace,
                             "exception": e
                             }
                )
            )
            if the_flow:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace},
                    job_id=job.job_id,
                    flow_id=the_flow.parameters['flow_id']
                )
            else:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg + "\n" + _trace}
                )

            try:
                etcd_utils.write(job_status_key,
                                 "failed",
                                 prevValue="processing")
            except etcd.EtcdCompareFailed:
                # This should not happen!
                _msg = "Cannot mark job as 'failed', current" \
                       "job status invalid"
                raise FlowExecutionFailedError(_msg)
            else:
                job = job.load()
                job.errors = _trace
                if job.payload.get('parent') is None:
                    alert_utils.alert_job_status(
                        "failed",
                        "Job failed (job_id: %s)" % job.job_id,
                        integration_id=NS.tendrl_context.integration_id or
                        job.payload['parameters'].get(
                            'TendrlContext.integration_id'
                        ),
                        cluster_name=NS.tendrl_context.cluster_name or
                        job.payload['parameters'].get(
                            'TendrlContext.cluster_name'
                        )
                    )
                job.save()
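The job claim in process_job() relies on etcd's atomic compare-and-swap; a minimal sketch with illustrative key names and a hypothetical node id:

import json
import etcd

client = etcd.Client(host="127.0.0.1", port=2379)  # assumed local etcd
jid = "1f6f3c2e"  # hypothetical job id

try:
    # Only the agent that flips status "new" -> "processing" owns the job
    client.write("/queue/%s/status" % jid, "processing", prevValue="new")
    client.write("/queue/%s/locked_by" % jid,
                 json.dumps({"node_id": "node-1", "type": "node"}))
except etcd.EtcdCompareFailed:
    pass  # another agent already claimed this job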
Exemplo n.º 28
0
def sync():
    try:
        Event(
            Message(priority="debug",
                    publisher=NS.publisher_id,
                    payload={"message": "Running SDS detection"}))
        try:
            sds_discovery_manager = sds_manager.SDSDiscoveryManager()
        except ValueError as ex:
            Event(
                ExceptionMessage(priority="debug",
                                 publisher=NS.publisher_id,
                                 payload={
                                     "message":
                                     "Failed to init SDSDiscoveryManager.",
                                     "exception": ex
                                 }))
            return

        # Execute the SDS discovery plugins and tag the nodes with data
        for plugin in sds_discovery_manager.get_available_plugins():
            sds_details = plugin.discover_storage_system()
            if ('detected_cluster_id' in sds_details
                    and sds_details['detected_cluster_id'] != ""):
                if sds_details:
                    try:
                        dc = NS.tendrl.objects.DetectedCluster().load()
                        dc_changed = False
                        if dc.detected_cluster_id:
                            if dc.detected_cluster_id != sds_details.get(
                                    'detected_cluster_id'):
                                dc_changed = True
                        else:
                            time.sleep(3)

                        integration_index_key = \
                            "indexes/detected_cluster_id_to_integration_id/" \
                            "%s" % sds_details['detected_cluster_id']
                        try:
                            if dc_changed:
                                integration_id = \
                                    NS.tendrl_context.integration_id
                                NS._int.wclient.write(integration_index_key,
                                                      integration_id)
                            else:
                                integration_id = str(uuid.uuid4())
                                NS._int.wclient.write(integration_index_key,
                                                      integration_id,
                                                      prevExist=False)
                        except etcd.EtcdAlreadyExist:
                            if not dc_changed:
                                integration_id = NS._int.client.read(
                                    integration_index_key).value
                        finally:
                            NS.tendrl_context.integration_id = integration_id
                            NS.tendrl_context.cluster_id = sds_details.get(
                                'detected_cluster_id')
                            NS.tendrl_context.cluster_name = sds_details.get(
                                'detected_cluster_name')
                            NS.tendrl_context.sds_name = sds_details.get(
                                'pkg_name')
                            NS.tendrl_context.sds_version = sds_details.get(
                                'pkg_version')
                            NS.tendrl_context.save()

                        NS.node_context = NS.node_context.load()
                        integration_tag = "tendrl/integration/%s" % \
                                          integration_id
                        detected_cluster_tag = "detected_cluster/%s" % \
                                               sds_details[
                                                   'detected_cluster_id']
                        NS.node_context.tags += [
                            detected_cluster_tag, integration_tag
                        ]
                        NS.node_context.tags = list(set(NS.node_context.tags))
                        NS.node_context.save()
                        _cluster = NS.tendrl.objects.Cluster(
                            integration_id=NS.tendrl_context.integration_id
                        ).load()

                        NS.tendrl.objects.DetectedCluster(
                            detected_cluster_id=sds_details.get(
                                'detected_cluster_id'),
                            detected_cluster_name=sds_details.get(
                                'detected_cluster_name'),
                            sds_pkg_name=sds_details.get('pkg_name'),
                            sds_pkg_version=sds_details.get('pkg_version'),
                        ).save()

                        if _cluster.is_managed == "yes":
                            continue
                        else:
                            _cluster.is_managed = "no"
                            _cluster.save()

                    except (etcd.EtcdException, KeyError) as ex:
                        Event(
                            ExceptionMessage(priority="debug",
                                             publisher=NS.publisher_id,
                                             payload={
                                                 "message":
                                                 "Failed SDS detection",
                                                 "exception": ex
                                             }))
                    break
    except Exception as ex:
        Event(
            ExceptionMessage(priority="error",
                             publisher=NS.publisher_id,
                             payload={
                                 "message":
                                 "node_sync "
                                 "SDS detection failed: " + ex.message,
                                 "exception":
                                 ex
                             }))
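The detected_cluster_id -> integration_id index above is created atomically with prevExist=False, falling back to a read when another node got there first; a trimmed sketch with assumed key names:

import uuid
import etcd

client = etcd.Client(host="127.0.0.1", port=2379)  # assumed local etcd
index_key = "indexes/detected_cluster_id_to_integration_id/demo-cluster"

integration_id = str(uuid.uuid4())
try:
    # Succeeds only for the first node that detects this cluster
    client.write(index_key, integration_id, prevExist=False)
except etcd.EtcdAlreadyExist:
    # Lost the race; reuse the integration id that was already stored
    integration_id = client.read(index_key).value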
Exemplo n.º 29
0
def sync(sync_ttl=None):
    try:
        tags = []
        # update node agent service details
        logger.log("debug", NS.publisher_id,
                   {"message": "node_sync, Updating Service data"})
        for service in TENDRL_SERVICES:
            s = NS.tendrl.objects.Service(service=service)
            if s.running:
                service_tag = NS.compiled_definitions.get_parsed_defs(
                )['namespace.tendrl']['tags'][service.strip("@*")]
                tags.append(service_tag)

                if service_tag == "tendrl/server":
                    tags.append("tendrl/monitor")
            s.save()

        if "tendrl/monitor" not in tags and \
            NS.tendrl_context.integration_id:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id).load()
            # Try to claim orphan "provisioner_%integration_id" tag
            _tag = "provisioner/%s" % _cluster.integration_id
            _is_new_provisioner = False
            NS.node_context = NS.tendrl.objects.NodeContext().load()
            if _tag not in NS.node_context.tags:
                try:
                    _index_key = "/indexes/tags/%s" % _tag
                    _node_id = json.dumps([NS.node_context.node_id])
                    etcd_utils.write(_index_key, _node_id, prevExist=False)
                    if sync_ttl:
                        etcd_utils.refresh(_index_key, sync_ttl + 50)
                    tags.append(_tag)
                    _is_new_provisioner = True
                except etcd.EtcdAlreadyExist:
                    pass

        # updating node context with latest tags
        logger.log(
            "debug", NS.publisher_id,
            {"message": "node_sync, updating node context "
             "data with tags"})
        NS.node_context = NS.tendrl.objects.NodeContext().load()
        current_tags = list(NS.node_context.tags)
        tags += current_tags
        NS.node_context.tags = list(set(tags))
        NS.node_context.tags.sort()
        current_tags.sort()
        if NS.node_context.tags != current_tags:
            NS.node_context.save()

        if "tendrl/monitor" not in tags and \
            NS.tendrl_context.integration_id:
            _cluster = _cluster.load()
            if _is_new_provisioner and _cluster.is_managed == "yes":
                _msg = "node_sync, NEW provisioner node found! "\
                    "re-configuring monitoring (job-id: %s) on this node"
                payload = {
                    "tags": ["tendrl/node_%s" % NS.node_context.node_id],
                    "run": "tendrl.flows.ConfigureMonitoring",
                    "status": "new",
                    "parameters": {
                        'TendrlContext.integration_id':
                        NS.tendrl_context.integration_id
                    },
                    "type": "node"
                }
                _job_id = str(uuid.uuid4())
                NS.tendrl.objects.Job(job_id=_job_id,
                                      status="new",
                                      payload=payload).save()
                logger.log("debug", NS.publisher_id,
                           {"message": _msg % _job_id})

        # Update /indexes/tags/:tag = [node_ids]
        for tag in NS.node_context.tags:

            index_key = "/indexes/tags/%s" % tag
            _node_ids = []
            try:
                _node_ids = etcd_utils.read(index_key).value
                _node_ids = json.loads(_node_ids)
            except etcd.EtcdKeyNotFound:
                pass

            if _node_ids:
                if "provisioner" in tag:
                    # Check if this is a stale provisioner
                    if NS.node_context.node_id != _node_ids[0]:
                        NS.node_context.tags.remove(tag)
                        NS.node_context.save()
                        continue
                if NS.node_context.node_id in _node_ids:
                    if sync_ttl and len(_node_ids) == 1:
                        etcd_utils.refresh(index_key, sync_ttl + 50)

                    continue
                else:
                    _node_ids += [NS.node_context.node_id]
            else:
                _node_ids = [NS.node_context.node_id]
            _node_ids = list(set(_node_ids))

            etcd_utils.write(index_key, json.dumps(_node_ids))
            if sync_ttl and len(_node_ids) == 1:
                etcd_utils.refresh(index_key, sync_ttl + 50)
        logger.log("debug", NS.publisher_id,
                   {"message": "node_sync, Updating detected "
                    "platform"})
    except Exception as ex:
        Event(
            ExceptionMessage(priority="error",
                             publisher=NS.publisher_id,
                             payload={
                                 "message":
                                 "node_sync service and indexes "
                                 "sync failed: " + ex.message,
                                 "exception":
                                 ex
                             }))
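A compact sketch of the tag-index maintenance in sync(): read the node-id list for a tag, add this node, write it back, and refresh the TTL when this node is the sole owner (plain python-etcd calls stand in for the project's etcd_utils wrappers; node_id and sync_ttl are hypothetical):

import json
import etcd

client = etcd.Client(host="127.0.0.1", port=2379)  # assumed local etcd
node_id, sync_ttl = "node-1", 120
index_key = "/indexes/tags/tendrl/monitor"

try:
    node_ids = json.loads(client.read(index_key).value)
except etcd.EtcdKeyNotFound:
    node_ids = []

if node_id not in node_ids:
    node_ids.append(node_id)
    client.write(index_key, json.dumps(node_ids))
if sync_ttl and len(node_ids) == 1:
    # keep the index alive a little longer than the sync interval
    client.refresh(index_key, sync_ttl + 50)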
Exemplo n.º 30
0
    def load_definition(self):
        cls_name = self.__class__.__name__
        if hasattr(self, "obj"):
            obj_name = self.obj.__name__
            Event(
                Message(priority="debug",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Load definitions for namespace.%s."
                            "objects.%s.flows.%s" %
                            (self._ns.ns_src, obj_name, cls_name)
                        }))
            try:
                return self._ns.get_obj_flow_definition(obj_name, cls_name)
            except KeyError as ex:
                msg = "Could not find definitions for " \
                      "namespace.%s.objects.%s.flows.%s" % (self._ns.ns_src,
                                                            obj_name,
                                                            cls_name)
                Event(
                    ExceptionMessage(priority="debug",
                                     publisher=NS.publisher_id,
                                     payload={
                                         "message": "Error",
                                         "exception": ex
                                     }))
                Event(
                    Message(priority="debug",
                            publisher=NS.publisher_id,
                            payload={"message": msg}))
                raise Exception(msg)
            finally:
                self.to_str = "%s.objects.%s.flows.%s" % (self._ns.ns_name,
                                                          obj_name, cls_name)

        else:
            Event(
                Message(priority="debug",
                        publisher=NS.publisher_id,
                        payload={
                            "message":
                            "Load definitions for namespace.%s."
                            "flows.%s" % (self._ns.ns_src, cls_name)
                        }))
            try:
                return self._ns.get_flow_definition(cls_name)
            except KeyError as ex:
                msg = "Could not find definitions for namespace.%s.flows.%s" %\
                      (self._ns.ns_src, cls_name)
                Event(
                    ExceptionMessage(priority="debug",
                                     publisher=NS.publisher_id,
                                     payload={
                                         "message": "Error",
                                         "exception": ex
                                     }))
                Event(
                    Message(priority="debug",
                            publisher=NS.publisher_id,
                            payload={"message": msg}))
                raise Exception(msg)
            finally:
                self.to_str = "%s.flows.%s" % (self._ns.ns_name, cls_name)