Example No. 1
    def run(self):
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "%s running" % self.__class__.__name__}
        )
        while not self._complete.is_set():
            _job_sync_interval = 5
            NS.node_context = NS.node_context.load()
            if "tendrl/monitor" in NS.node_context.tags:
                _job_sync_interval = 3

            time.sleep(_job_sync_interval)
            try:
                jobs = etcd_utils.read("/queue")
            except etcd.EtcdKeyNotFound:
                continue

            for job in jobs.leaves:
                # Check job not already locked by some agent
                jid = job.key.split('/')[-1]
                job_lock_key = "/queue/%s/locked_by" % jid
                try:
                    _locked_by = etcd_utils.read(job_lock_key).value
                    if _locked_by:
                        continue
                except etcd.EtcdKeyNotFound:
                    pass

                _job_thread = threading.Thread(
                    target=process_job, args=(jid,)
                )
                _job_thread.daemon = True
                _job_thread.start()
                _job_thread.join()
def get_volumes_details(cluster_key):
    volume_details = []
    try:
        volume_list = utils.get_resource_keys(cluster_key, "Volumes")
        for volume_id in volume_list:
            deleted = etcd_utils.read(cluster_key + "/Volumes/" +
                                      str(volume_id) + "/" + "deleted").value
            if str(deleted).lower() != "true":
                volume_data = {}
                for attr in ATTRS["volumes"]:
                    volume_data[attr] = etcd_utils.read(cluster_key +
                                                        "/Volumes/" +
                                                        str(volume_id) + "/" +
                                                        attr).value
                subvolume_key = cluster_key + "/Volumes/" + str(volume_id)
                subvolume_details = get_subvolume_details(subvolume_key)
                volume_data["subvolume"] = subvolume_details
                volume_details.append(volume_data)
    except (KeyError, etcd.EtcdKeyNotFound) as ex:
        logger.log(
            "debug", NS.get("publisher_id", None), {
                'message':
                "Error while fetching "
                "volume id {}".format(volume_id) + str(ex)
            })
    return volume_details
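Several of these snippets wrap `etcd_utils.read(...)` in a `try/except etcd.EtcdKeyNotFound` only to fall back to a default value. A minimal sketch of that pattern as a reusable helper is shown below; `read_or_default` is a hypothetical name, and the `tendrl.commons.utils` import path is assumed from these snippets rather than prescribed by them.

import etcd

from tendrl.commons.utils import etcd_utils


def read_or_default(key, default=None):
    # Return the stored value for key, or the default when the key is absent.
    try:
        return etcd_utils.read(key).value
    except etcd.EtcdKeyNotFound:
        return default


# e.g. treat a missing "deleted" flag as "false":
# deleted = read_or_default(cluster_key + "/Volumes/" + volume_id + "/deleted", "false")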
Example No. 3
 def get_alert_destinations(self, key):
     email_ids = []
     email_notifications = etcd_utils.read(key)
     for email_notification in email_notifications.leaves:
         email = etcd_utils.read(email_notification.key).value
         email_ids.append(email)
     return email_ids
Example No. 4
 def exists(self):
     self.render()
     _exists = True
     try:
         etcd_utils.read("/{0}".format(self.value))
     except etcd.EtcdKeyNotFound:
         _exists = False
     return _exists
Example No. 5
 def exists(self):
     self.render()
     _exists = True
     try:
         etcd_utils.read("/{0}".format(self.value))
     except etcd.EtcdKeyNotFound:
         _exists = False
     return _exists
Example No. 6
    def run(self):
        node_ids = self.parameters.get('Node[]')
        if not node_ids or len(node_ids) == 0:
            raise AtomExecutionFailedError("Node[] cannot be empty")

        for node_id in node_ids:
            # Check if node has the OS details populated
            try:
                os_details = etcd_utils.read("nodes/%s/Os" % node_id)
                if os_details.leaves is None:
                    raise AtomExecutionFailedError(
                        "Node doesnt have OS details populated"
                    )
            except etcd.EtcdKeyNotFound:
                raise AtomExecutionFailedError(
                    "Node doesnt have OS details populated"
                )

            # Check if node has the CPU details populated
            try:
                cpu_details = etcd_utils.read("nodes/%s/Cpu" % node_id)
                if cpu_details.leaves is None:
                    raise AtomExecutionFailedError(
                        "Node doesnt have CPU details populated"
                    )
            except etcd.EtcdKeyNotFound:
                raise AtomExecutionFailedError(
                    "Node doesnt have CPU details populated"
                )

            # Check if node has the Memory populated
            try:
                memory_details = etcd_utils.read(
                    "nodes/%s/Memory" % node_id
                )
                if memory_details.leaves is None:
                    raise AtomExecutionFailedError(
                        "Node doesnt have Memory details populated"
                    )
            except etcd.EtcdKeyNotFound:
                raise AtomExecutionFailedError(
                    "Node doesnt have Memory details populated"
                )

            # Check if node has networks details populated
            try:
                networks = etcd_utils.read("nodes/%s/Networks" % node_id)
                if networks.leaves is None:
                    raise AtomExecutionFailedError(
                        "Node doesnt have network details populated"
                    )
            except etcd.EtcdKeyNotFound:
                raise AtomExecutionFailedError(
                    "Node doesnt have network details populated"
                )

        return True
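The four near-identical blocks above differ only in the sub-key being checked. A sketch of the same validation driven by a table of sub-keys is shown here; `check_node_details` and `REQUIRED_DETAILS` are hypothetical names, and `AtomExecutionFailedError` is assumed to be importable exactly as in the example above.

import etcd

from tendrl.commons.utils import etcd_utils

# Sub-key under nodes/<node_id>/ mapped to the label used in the error message.
REQUIRED_DETAILS = {
    "Os": "OS",
    "Cpu": "CPU",
    "Memory": "Memory",
    "Networks": "network",
}


def check_node_details(node_id):
    # Raise AtomExecutionFailedError if any expected detail sub-tree is
    # missing or empty for the given node.
    for sub_key, label in REQUIRED_DETAILS.items():
        try:
            details = etcd_utils.read("nodes/%s/%s" % (node_id, sub_key))
        except etcd.EtcdKeyNotFound:
            details = None
        if details is None or details.leaves is None:
            raise AtomExecutionFailedError(
                "Node doesn't have %s details populated" % label
            )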
Example No. 7
    def run(self):
        integration_id = self.parameters['TendrlContext.integration_id']

        etcd_keys_to_delete = []
        etcd_keys_to_delete.append("/clusters/%s/nodes" % integration_id)
        etcd_keys_to_delete.append("/clusters/%s/Bricks" % integration_id)
        etcd_keys_to_delete.append("/clusters/%s/Volumes" % integration_id)
        etcd_keys_to_delete.append("/clusters/%s/GlobalDetails" %
                                   integration_id)
        etcd_keys_to_delete.append("/clusters/%s/TendrlContext" %
                                   integration_id)
        etcd_keys_to_delete.append("/clusters/%s/Utilization" % integration_id)
        etcd_keys_to_delete.append("/clusters/%s/raw_map" % integration_id)
        etcd_keys_to_delete.append("/alerting/clusters/%s" % integration_id)
        nodes = etcd_utils.read("/clusters/%s/nodes" % integration_id)
        node_ids = []
        for node in nodes.leaves:
            node_id = node.key.split("/")[-1]
            node_ids.append(node_id)
            etcd_keys_to_delete.append("/alerting/nodes/%s" % node_id)

        # Find the alerting/alerts entries to be deleted
        try:
            cluster_alert_ids = etcd_utils.read("/alerting/clusters")
            for entry in cluster_alert_ids.leaves:
                ca_id = entry.key.split("/")[-1]
                etcd_keys_to_delete.append("/alerting/alerts/%s" % ca_id)
        except etcd.EtcdKeyNotFound:
            # No cluster alerts, continue
            pass

        try:
            node_alert_ids = etcd_utils.read("/alerting/nodes")
            for entry in node_alert_ids.leaves:
                na_id = entry.key.split("/")[-1]
                etcd_keys_to_delete.append("/alerting/alerts/%s" % na_id)
        except etcd.EtcdKeyNotFound:
            # No node alerts, continue
            pass

        # Remove the cluster details
        for key in list(set(etcd_keys_to_delete)):
            try:
                etcd_utils.delete(key, recursive=True)
            except etcd.EtcdKeyNotFound:
                logger.log(
                    "debug",
                    NS.publisher_id,
                    {"message": "%s key not found for deletion" % key},
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                )
                continue

        return True
Example No. 8
    def shutdown(signum, frame):
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Signal handler: stopping"}
        )
        # Remove the node's name from gluster server tag
        try:
            gl_srvr_list = etcd_utils.read(
                "/indexes/tags/gluster/server"
            ).value
            gl_srvr_list = json.loads(gl_srvr_list)
            if NS.node_context.node_id in gl_srvr_list:
                gl_srvr_list.remove(NS.node_context.node_id)
            etcd_utils.write(
                "/indexes/tags/gluster/server",
                json.dumps(gl_srvr_list)
            )
            node_tags = json.loads(NS.node_context.tags)
            if 'provisioner/%s' % NS.tendrl_context.integration_id \
                in node_tags:
                etcd_utils.delete(
                    "/indexes/tags/provisioner/%s" %
                    NS.tendrl_context.integration_id,
                    recursive=True
                )
            int_srvr_list = etcd_utils.read(
                "/indexes/tags/tendrl/integration/gluster"
            ).value
            int_srvr_list = json.loads(int_srvr_list)
            if NS.node_context.node_id in int_srvr_list:
                int_srvr_list.remove(NS.node_context.node_id)
            etcd_utils.write(
                "/indexes/tags/tendrl/integration/gluster",
                json.dumps(int_srvr_list)
            )
        except etcd.EtcdKeyNotFound:
            logger.log(
                "debug",
                NS.publisher_id,
                {
                    "message": "Couldnt remove node from "
                    "gluster servers list tag."
                    "integration_id: %s, node_id: %s" %
                    (
                        NS.tendrl_context.integration_id,
                        NS.node_context.node_id
                    )
                }
            )
            pass

        complete.set()
        m.stop()
Example No. 9
    def shutdown(signum, frame):
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Signal handler: stopping"}
        )
        # Remove the node's name from gluster server tag
        try:
            gl_srvr_list = etcd_utils.read(
                "/indexes/tags/gluster/server"
            ).value
            gl_srvr_list = json.loads(gl_srvr_list)
            if NS.node_context.node_id in gl_srvr_list:
                gl_srvr_list.remove(NS.node_context.node_id)
            etcd_utils.write(
                "/indexes/tags/gluster/server",
                json.dumps(gl_srvr_list)
            )
            node_tags = NS.node_context.tags
            if 'provisioner/%s' % NS.tendrl_context.integration_id \
                in node_tags:
                etcd_utils.delete(
                    "/indexes/tags/provisioner/%s" %
                    NS.tendrl_context.integration_id,
                    recursive=True
                )
            int_srvr_list = etcd_utils.read(
                "/indexes/tags/tendrl/integration/gluster"
            ).value
            int_srvr_list = json.loads(int_srvr_list)
            if NS.node_context.node_id in int_srvr_list:
                int_srvr_list.remove(NS.node_context.node_id)
            etcd_utils.write(
                "/indexes/tags/tendrl/integration/gluster",
                json.dumps(int_srvr_list)
            )
        except etcd.EtcdKeyNotFound:
            logger.log(
                "debug",
                NS.publisher_id,
                {
                    "message": "Couldnt remove node from "
                    "gluster servers list tag."
                    "integration_id: %s, node_id: %s" %
                    (
                        NS.tendrl_context.integration_id,
                        NS.node_context.node_id
                    )
                }
            )
            pass

        complete.set()
        m.stop()
Example No. 10
 def get_volume_details(self, objects, cluster_key):
     volume_detail = []
     volume_list = utils.get_resource_keys(cluster_key, "Volumes")
     for volume in volume_list:
         resource_detail = {}
         volume_key = os.path.join(cluster_key, "Volumes", volume)
         volume_deleted_key = os.path.join(volume_key, "deleted")
         try:
             is_volume_deleted = etcd_utils.read(volume_deleted_key).value
             if is_volume_deleted.lower() == "true":
                 continue
         except etcd.EtcdKeyNotFound:
             continue
         for key, value in objects["Volume"]["attrs"].items():
             if value is None:
                 try:
                     attr_key = os.path.join(volume_key, key)
                     attr_data = etcd_utils.read(attr_key)
                     attr_value = self.resource_status_mapper(
                         str(attr_data.value))
                     resource_detail[key] = attr_value
                 except (KeyError, etcd.EtcdKeyNotFound) as ex:
                     logger.log(
                         "debug", NS.get("publisher_id", None), {
                             'message':
                             "Cannot Find {0} in volume "
                             "{1}".format(key, volume) + str(ex)
                         })
             else:
                 try:
                     new_key = os.path.join(
                         volume_key,
                         objects["Volume"]["attrs"][key]["value"].rsplit(
                             "/", 1)[1])
                     resp_data = self.get_object_from_central_store(
                         new_key, objects["Volume"]["attrs"][key])
                     resource_detail[key] = resp_data
                 except (etcd.EtcdKeyNotFound, AttributeError,
                         KeyError) as ex:
                     resource_detail[key] = {
                         "total": 0,
                         "up": 0,
                         "down": 0,
                         "partial": 0,
                         "created": 0,
                         "stopped": 0,
                         "paused": 0
                     }
         if not resource_detail == {}:
             volume_detail.append(resource_detail)
     return volume_detail
Example No. 11
def find_volume_id(vol_name, integration_id):
    try:
        volumes = etcd_utils.read("clusters/%s/Volumes" % integration_id)
        for volume in volumes.leaves:
            key = volume.key + "/name"
            name = etcd_utils.read(key).value
            if vol_name == name:
                return volume.key.split("/")[-1]
    except (EtcdKeyNotFound) as ex:
        logger.log("error", NS.publisher_id, {
            "message":
            "Failed to fetch volume id for volume name %s" % vol_name
        })
        raise ex
Example No. 12
def run():
    try:
        clusters = etcd_utils.read("/clusters")
    except etcd.EtcdKeyNotFound:
        return

    # This logic only runs on the node with tag `tendrl/monitor` (tendrl server).
    # It checks all the clusters and tries to set the cluster status to
    # `unhealthy` if the status field is missing (expired due to TTL)
    for cluster in clusters.leaves:
        int_id = cluster.key.split('/')[-1]
        fetched_cluster = NS.tendrl.objects.Cluster(
            integration_id=int_id
        ).load()
        try:
            if fetched_cluster and fetched_cluster.is_managed == "yes":
                NS._int.client.write(
                    "/clusters/{0}/GlobalDetails/status".format(
                        int_id
                    ),
                    "unhealthy",
                    prevExist=False
                )
        except etcd.EtcdAlreadyExist:
            pass

    return
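The `prevExist=False` flag above is what keeps this loop idempotent: the status key is created only if it is absent (for example after its TTL expired), and `etcd.EtcdAlreadyExist` is raised instead of overwriting a value another writer already set. A standalone sketch of that create-if-absent step against a plain python-etcd client, with a hypothetical helper name, might look like:

import etcd


def mark_unhealthy_if_unreported(client, integration_id):
    # Create the status key only when it is missing; if a status already
    # exists, leave it untouched and report that nothing was written.
    key = "/clusters/{0}/GlobalDetails/status".format(integration_id)
    try:
        client.write(key, "unhealthy", prevExist=False)
        return True
    except etcd.EtcdAlreadyExist:
        return False


# client = etcd.Client(host="127.0.0.1", port=2379)
# mark_unhealthy_if_unreported(client, integration_id)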
Example No. 13
    def run(self):
        integration_id = self.parameters['TendrlContext.integration_id']

        # Wait for /indexes/tags/tendrl/integration/$integration_id
        # to appear. This means cluster is import ready
        wait_count = 6
        loop_count = 0
        while True:
            try:
                integration_id_index_key = \
                    "indexes/tags/tendrl/integration/%s" % integration_id
                _node_ids = etcd_utils.read(
                    integration_id_index_key
                ).value
                if _node_ids:
                    return True
                if loop_count >= wait_count:
                    raise AtomExecutionFailedError(
                        "Cluster: %s is not yet marked as "
                        "import ready. Timing out." %
                        integration_id
                    )
            except etcd.EtcdKeyNotFound:
                time.sleep(5)
                loop_count += 1
                continue
        return True
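Note that the loop above only sleeps and advances `loop_count` inside the `except` branch, so an existing-but-empty index key would be re-read in a tight loop. A bounded poll that waits on every attempt, sketched below with a hypothetical `wait_for_key` helper, avoids that corner case:

import time

import etcd

from tendrl.commons.utils import etcd_utils


def wait_for_key(key, attempts=6, interval=5):
    # Poll the key until it holds a non-empty value or the attempts run out;
    # sleep between attempts whether the key is missing or just empty.
    for _ in range(attempts):
        try:
            value = etcd_utils.read(key).value
            if value:
                return value
        except etcd.EtcdKeyNotFound:
            pass
        time.sleep(interval)
    return None


# node_ids = wait_for_key("indexes/tags/tendrl/integration/%s" % integration_id)
# if node_ids is None:
#     raise AtomExecutionFailedError("Cluster: %s is not yet marked as import ready." % integration_id)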
Example No. 14
def sync_volume_connections(volumes):
    for volume in volumes:
        subvol_count = 0
        vol_connections = 0
        while True:
            try:
                subvol = etcd_utils.read(
                    "clusters/%s/Volumes/%s/Bricks/subvolume%s" %
                    (NS.tendrl_context.integration_id, volume.vol_id,
                     subvol_count))
                if subvol:
                    for entry in subvol.leaves:
                        brick_name = entry.key.split("/")[-1]
                        fetched_brick = NS.tendrl.objects.GlusterBrick(
                            NS.tendrl_context.integration_id,
                            brick_name.split(":")[0],
                            brick_name.split(":_")[-1]).load()
                        if fetched_brick and fetched_brick.client_count:
                            vol_connections += 0 \
                                if fetched_brick.client_count == '' \
                                else int(fetched_brick.client_count)
                    subvol_count += 1
            except etcd.EtcdKeyNotFound:
                break
        volume.client_count = vol_connections
        volume.save()
 def refresh_dashboard(self):
     try:
         # check whether the alert organization exists
         if NS.config.data["org_id"]:
             cluster_details = {}
             dashboards = []
             integration_ids = utils.get_resource_keys("", "clusters")
             for integration_id in integration_ids:
                 key = "/clusters/%s/TendrlContext/sds_name" % \
                     integration_id
                 sds_name = etcd_utils.read(key).value
                 if sds_name == constants.GLUSTER:
                     cluster_details, dashboards = gluster_cluster_details.\
                         get_cluster_details(
                             integration_id
                         )
                     cluster_details["sds_name"] = constants.GLUSTER
                 self.update_dashboard(cluster_details, dashboards)
         else:
             # try to create alert organization once again
             alert_organization.create()
     except (etcd.EtcdException, KeyError, AttributeError,
             req_excep.ConnectionError, TypeError,
             req_excep.RequestException,
             exceptions.ConnectionFailedException,
             exceptions.AlertOrganizationNotFound) as ex:
         logger.log(
             "debug", NS.get("publisher_id", None), {
                 'message':
                 "Failed to update cluster "
                 "dashboard.err: %s" % str(ex)
             })
Example No. 16
    def _run(self):
        aggregate_gluster_objects = NS.monitoring.definitions.get_parsed_defs(
        )["namespace.monitoring"]["graphite_data"]

        while not self._complete.is_set():
            if self.sync_interval is None:
                try:
                    interval = etcd_utils.read(
                        "_NS/gluster/config/data/sync_interval")
                    try:
                        self.sync_interval = float(interval.value)
                    except ValueError as ex:
                        logger.log(
                            "error", NS.get("publisher_id", None), {
                                'message':
                                "Unable to parse tendrl-gluster-integration config 'sync_interval' (value: %s)"
                                % interval.value
                            })
                        raise ex
                except etcd.EtcdKeyNotFound as ex:
                    continue

            try:
                gevent.sleep(self.sync_interval)
                cluster_details = self.plugin_obj.get_central_store_data(
                    aggregate_gluster_objects)
                metrics = graphite_utils.create_metrics(
                    aggregate_gluster_objects, cluster_details)
                for metric in metrics:
                    for key, value in metric.items():
                        if value:
                            response = self.plugin_obj.push_metrics(key, value)
            except (etcd.EtcdKeyNotFound, AttributeError, KeyError) as ex:
                logger.log("error", NS.get("publisher_id", None),
                           {'message': str(ex)})
Example No. 17
    def load(self):
        self.render()
        if "Message" not in self.__class__.__name__:
            # If local object.hash is equal to
            # central_store object.hash, return
            if self.hash_compare_with_central_store():
                return self

        _copy = self._copy_vars()
        # Check if self.value already set, use it
        if self.value.find('{') < 0:
            _copy.value = self.value
        key = _copy.value + '/data'
        try:
            val_str = etcd_utils.read(key).value
        except etcd.EtcdKeyNotFound:
            return self
        loc_dict = json.loads(val_str)
        for attr_name, attr_val in vars(_copy).iteritems():
            _type = self._defs.get("attrs", {}).get(attr_name, {}).get("type")
            if loc_dict.get(attr_name) in [None, ""]:
                if _type and _type.lower() == 'list':
                    setattr(_copy, attr_name, list())
                if _type and _type.lower() == 'json':
                    setattr(_copy, attr_name, dict())
            else:
                if _type and _type.lower() in ['list']:
                    setattr(_copy, attr_name, json.loads(loc_dict[attr_name]))
                else:
                    setattr(_copy, attr_name, loc_dict[attr_name])
        return _copy
Example No. 18
 def get_node_details(self, objects, integration_id):
     node_detail = []
     _cluster_node_ids = etcd_utils.read("/clusters/%s/nodes" %
                                         integration_id)
     for _node_id in _cluster_node_ids.leaves:
         _cnc = NS.tendrl.objects.ClusterNodeContext(
             integration_id=integration_id,
             node_id=_node_id.key.split('/')[-1]).load()
         if _cnc.is_managed != "yes":
             continue
         resource_detail = {}
         for key, value in objects["Node"]["attrs"].items():
             if value is None:
                 attr_value = getattr(_cnc, key)
                 if attr_value not in [None, ""]:
                     attr_value = self.resource_status_mapper(
                         str(getattr(_cnc, key)))
                     resource_detail[key] = attr_value
                 else:
                     if key == 'status':
                         _node_context = NS.tendrl.objects.NodeContext(
                             node_id=_cnc.node_id).load()
                         attr_value = self.resource_status_mapper(
                             str(getattr(_node_context, 'status')))
                         resource_detail[key] = attr_value
         node_detail.append(resource_detail)
     return node_detail
Example No. 19
def aggregate_session_status():
    volumes = NS.tendrl.objects.GlusterVolume(
        NS.tendrl_context.integration_id
    ).load_all()
    georep_status = GeoReplicationSessionStatus()
    if volumes:
        for volume in volumes:
            vol_id = volume.vol_id
            sessions = None
            try:
                sessions = etcd_utils.read(
                    "clusters/%s/Volumes/%s/GeoRepSessions" % (
                        NS.tendrl_context.integration_id,
                        vol_id
                    )
                )
            except etcd.EtcdKeyNotFound:
                continue
            pair_count = int(volume.brick_count)
            for session in sessions.leaves:
                session_status = None
                session_id = session.key.split("GeoRepSessions/")[-1]
                pairs = NS.gluster.objects.GeoReplicationPair(
                    vol_id=vol_id,
                    session_id=session_id
                ).load_all()
                faulty_count = 0
                stopped_count = 0
                paused_count = 0
                created_count = 0
                for pair in pairs:
                    if pair.status.lower() == "faulty":
                        faulty_count += 1
                    elif pair.status.lower() == "created":
                        created_count += 1
                    elif pair.status.lower() == "stopped":
                        stopped_count += 1
                    elif pair.status.lower() == "paused":
                        paused_count += 1
                if created_count == pair_count:
                    session_status = georep_status.CREATED
                elif faulty_count == 0 and (
                        stopped_count == 0 and paused_count == 0 and (
                        created_count == 0
                        )
                ):
                    session_status = georep_status.UP
                elif pair_count == faulty_count:
                    session_status = georep_status.DOWN
                elif stopped_count == pair_count:
                    session_status = georep_status.STOPPED
                elif paused_count == pair_count:
                    session_status = georep_status.PAUSED
                else:
                    session_status = georep_status.PARTIAL
                NS.tendrl.objects.GeoReplicationSession(
                    vol_id=vol_id,
                    session_id=session_id,
                    session_status=session_status
                ).save()
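The chain of comparisons above is effectively a pure mapping from per-pair counts to a session status. Restating it as a standalone function (a hypothetical helper, reusing the `georep_status` constants loaded above) makes the precedence of the checks easier to see:

def derive_session_status(georep_status, pair_count, faulty_count,
                          stopped_count, paused_count, created_count):
    # All pairs created -> CREATED; none in any other state -> UP; all pairs
    # in the same state -> that state; any other mix -> PARTIAL.
    if created_count == pair_count:
        return georep_status.CREATED
    if faulty_count == 0 and stopped_count == 0 and \
            paused_count == 0 and created_count == 0:
        return georep_status.UP
    if faulty_count == pair_count:
        return georep_status.DOWN
    if stopped_count == pair_count:
        return georep_status.STOPPED
    if paused_count == pair_count:
        return georep_status.PAUSED
    return georep_status.PARTIAL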
Example No. 20
 def get_cluster_details(self, objects, cluster_key):
     cluster_detail = []
     for obj in objects["Cluster"]:
         if obj in ["metric", "value"]:
             continue
         resource_detail = {}
         resource_detail[str(obj)] = {}
         obj_details = objects["Cluster"][str(obj)]
         obj_key = os.path.join(cluster_key, str(obj))
         obj_attrs = obj_details["attrs"]
         for key, _ in obj_attrs.items():
             try:
                 attr_key = os.path.join(obj_key, key)
                 attr_data = etcd_utils.read(attr_key)
                 attr_value = self.cluster_status_mapper(
                     str(attr_data.value))
                 resource_detail[str(obj)][key] = copy.deepcopy(attr_value)
             except (KeyError, etcd.EtcdKeyNotFound) as ex:
                 integration_id = cluster_key.split("/")[-1]
                 logger.log(
                     "debug", NS.get("publisher_id", None), {
                         'message':
                         "Cannot Find {0} in Cluster "
                         "{1}".format(key, integration_id) + str(ex)
                     })
         if not resource_detail == {}:
             cluster_detail.append(resource_detail)
     return cluster_detail
Example No. 21
def find_node_id(integration_id, fqdn):
    try:
        nodes = etcd_utils.read("clusters/%s/nodes" % integration_id)
        for node in nodes.leaves:
            node_id = node.key.split('/')[-1]
            node_context = NS.tendrl.objects.ClusterNodeContext()
            # formatting the value here because render populates
            # integration_id from the namespace
            node_context.value = node_context.value.format(
                integration_id, node_id)
            if fqdn == node_context.load().fqdn:
                return node_id
        raise NodeNotFound
    except (EtcdKeyNotFound, NodeNotFound) as ex:
        if type(ex) != EtcdKeyNotFound:
            logger.log("error", NS.publisher_id,
                       {"message": "Failed to fetch fqdn for node %s" % fqdn})
        else:
            logger.log(
                "error", NS.publisher_id, {
                    "message":
                    "Node with fqdn %s not found "
                    "in cluster %s" % (fqdn, integration_id)
                })
        raise ex
Example No. 22
    def on_change(self, attr, prev_value, current_value):
        if attr == "status" and "tendrl/monitor" in NS.node_context.tags:
            _tc = NS.tendrl.objects.TendrlContext(node_id=self.node_id).load()
            # Check node is managed
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=self.node_id,
                integration_id=_tc.integration_id).load()
            if current_value is None and str(_cnc.is_managed).lower() == "yes":
                self.status = "DOWN"
                self.save()
                msg = "Node {0} is DOWN".format(self.fqdn)
                event_utils.emit_event("node_status",
                                       self.status,
                                       msg,
                                       "node_{0}".format(self.fqdn),
                                       "WARNING",
                                       node_id=self.node_id,
                                       integration_id=_tc.integration_id)
                # Loading cluster_node_context will load node_context
                # and update it with the latest values
                _cnc_new = \
                    NS.tendrl.objects.ClusterNodeContext(
                        node_id=self.node_id,
                        integration_id=_tc.integration_id,
                        first_sync_done=_cnc.first_sync_done,
                        is_managed=_cnc.is_managed
                    )
                _cnc_new.save()
                del _cnc_new
                # Update cluster details
                self.update_cluster_details(_tc.integration_id)
                _tag = "provisioner/%s" % _tc.integration_id
                if _tag in self.tags:
                    _index_key = "/indexes/tags/%s" % _tag
                    self.tags.remove(_tag)
                    self.save()
                    etcd_utils.delete(_index_key)
                if _tc.sds_name in ["gluster", "RHGS"]:
                    bricks = etcd_utils.read(
                        "clusters/{0}/Bricks/all/{1}".format(
                            _tc.integration_id, self.fqdn))

                    for brick in bricks.leaves:
                        try:
                            etcd_utils.write("{0}/status".format(brick.key),
                                             "Stopped")
                        except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                            pass
            elif current_value == "UP" and str(
                    _cnc.is_managed).lower() == "yes":
                msg = "{0} is UP".format(self.fqdn)
                event_utils.emit_event("node_status",
                                       "UP",
                                       msg,
                                       "node_{0}".format(self.fqdn),
                                       "INFO",
                                       node_id=self.node_id,
                                       integration_id=_tc.integration_id)
            del _cnc
Example No. 23
def find_node_id(integration_id, fqdn):
    _cluster_node_ids = etcd_utils.read("/clusters/%s/nodes" % integration_id)
    for _node_id in _cluster_node_ids.leaves:
        _cnc = NS.tendrl.objects.ClusterNodeContext(
            integration_id=integration_id,
            node_id=_node_id.key.split('/')[-1]).load()
        if _cnc.fqdn == fqdn:
            return _cnc.node_id
Example No. 24
def test_read():
    setattr(__builtin__, "NS", maps.NamedDict())
    setattr(NS, "_int", maps.NamedDict())
    NS._int.client = importlib.import_module("tendrl.commons"
                                             ".tests.fixtures."
                                             "client").Client()
    NS._int.reconnect = type("Dummy", (object, ), {})
    with patch.object(Client, "read", return_value="test") as mock_read:
        obj = etcd_utils.read("key")
        assert obj == "test"
        assert mock_read.called
    with patch.object(Client, "read", raise_etcdconnectionfailed) as mock_read:
        with pytest.raises(etcd.EtcdConnectionFailed):
            obj = etcd_utils.read("key")
    with patch.object(Client, "read", raise_etcdkeynotfound) as mock_read:
        with pytest.raises(etcd.EtcdKeyNotFound):
            obj = etcd_utils.read("key")
def get_cluster_details():
    '''
        Get details of gluster clusters from etcd.
        TODO: Optimize the code, reduce the number of etcd calls
        TODO: Extract etcd host and port from the configuration file
    '''

    cluster_details_list = []
    try:
        result = etcd_utils.read('/clusters')
        for item in result.leaves:
            cluster_obj = cluster_detail.ClusterDetail()
            cluster_obj.integration_id = item.key.split('/')[-1]
            client_str = '/clusters/' + str(cluster_obj.integration_id)
            cluster_details = etcd_utils.read(client_str)
            for cluster in cluster_details.leaves:
                if 'Volumes' in cluster.key:
                    volumes = etcd_utils.read(client_str + "/Volumes")
                    for volume in volumes.leaves:
                        volume_id = volume.key.split('/')[-1]
                        volume_details = etcd_utils.read(client_str +
                                                         "/Volumes/" +
                                                         str(volume_id))
                        vol_dict = maps.NamedDict()
                        for vol in volume_details.leaves:
                            if "name" in vol.key:
                                vol_dict.volume_name = vol.value
                            if "Bricks" in vol.key:
                                subvolume_details = etcd_utils.read(
                                    client_str + "/Volumes/" + str(volume_id) +
                                    "/Bricks")
                                vol_dict.bricks = []
                                for subvolume in subvolume_details.leaves:
                                    brick_details = etcd_utils.read(
                                        client_str + "/Volumes/" +
                                        str(volume_id) + "/Bricks/" +
                                        str(subvolume.key.split('/')[-1]))
                                    for brick in brick_details.leaves:
                                        vol_dict.bricks.append(
                                            brick.key.split('/')[-1])
                        cluster_obj.volumes.append(vol_dict)
                if 'nodes' in cluster.key:
                    nodes = etcd_utils.read(client_str + "/nodes")
                    for node in nodes.leaves:
                        node_id = node.key.split('/')[-1]
                        node_details = etcd_utils.read(client_str + "/nodes/" +
                                                       str(node_id) +
                                                       "/NodeContext")
                        for row in node_details.leaves:
                            if "fqdn" in row.key:
                                cluster_obj.hosts.append(row.value)

            cluster_details_list.append(cluster_obj)
        return cluster_details_list
    except (etcd.EtcdKeyNotFound, KeyError) as ex:
        logger.log("error", NS.get("publisher_id", None), {'message': str(ex)})
        return None
Example No. 26
 def _enable_disable_volume_profiling(self):
     cluster = NS.tendrl.objects.Cluster(
         integration_id=NS.tendrl_context.integration_id).load()
     volumes = NS.gluster.objects.Volume().load_all() or []
     # Enable / disable based on the cluster flag volume_profiling_flag
     # should be done only once, during the first sync. Later the volume
     # level volume_profiling_state should be set based on individual
     # volume level values
     first_sync_done = etcd_utils.read(
         "/clusters/%s/nodes/%s/NodeContext/first_sync_done" %
         (NS.tendrl_context.integration_id, NS.node_context.node_id)).value
     if first_sync_done in [None, "no", ""]:
         failed_vols = []
         if cluster.volume_profiling_flag == "enable":
             for volume in volumes:
                 if volume.profiling_enabled == "yes":
                     continue
                 out, err, rc = cmd_utils.Command(
                     "gluster volume profile %s start" % volume.name).run()
                 if (err or rc != 0) and \
                     "already started" in err:
                     failed_vols.append(volume.name)
             if len(failed_vols) > 0:
                 logger.log(
                     "debug", NS.publisher_id, {
                         "message":
                         "Profiling already "
                         "enabled for volumes: %s" % str(failed_vols)
                     })
             cluster.volume_profiling_state = "enabled"
         if cluster.volume_profiling_flag == "disable":
             for volume in volumes:
                 if volume.profiling_enabled == "no":
                     continue
                 out, err, rc = cmd_utils.Command(
                     "gluster volume profile %s stop" % volume.name).run()
                 if (err or rc != 0) and \
                     "not started" in err:
                     failed_vols.append(volume.name)
             if len(failed_vols) > 0:
                 logger.log(
                     "debug", NS.publisher_id, {
                         "message":
                         "Profiling not "
                         "enabled for volumes: %s" % str(failed_vols)
                     })
             cluster.volume_profiling_state = "disabled"
     profiling_enabled_count = 0
     for volume in volumes:
         if volume.profiling_enabled == "yes":
             profiling_enabled_count += 1
     if profiling_enabled_count == 0:
         cluster.volume_profiling_state = "disabled"
     elif profiling_enabled_count == len(volumes):
         cluster.volume_profiling_state = "enabled"
     elif profiling_enabled_count < len(volumes):
         cluster.volume_profiling_state = "mixed"
     cluster.save()
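The tail of the method above collapses the per-volume `profiling_enabled` flags into a single cluster-level state. A small pure-function sketch of that aggregation (a hypothetical helper, not part of the class above):

def aggregate_profiling_state(profiling_flags):
    # profiling_flags is a list of per-volume "yes"/"no" values.
    enabled = sum(1 for flag in profiling_flags if flag == "yes")
    if enabled == 0:
        return "disabled"
    if enabled == len(profiling_flags):
        return "enabled"
    return "mixed"


# aggregate_profiling_state(["yes", "no", "yes"])  # -> "mixed"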
Example No. 27
 def get_object_from_central_store(self, resource_key, obj_attr):
     attr_details = etcd_utils.read(resource_key)
     resource_details = {"details": []}
     for attr_detail in attr_details.leaves:
         resource_detail = {}
         attr_key = attr_detail.key.rsplit("/", 1)[1]
         for key, value in obj_attr["attrs"].items():
             sub_attr = etcd_utils.read(
                 os.path.join(resource_key, attr_key, key))
             resource_detail[key] = sub_attr.value
         resource_details["details"].append(resource_detail)
     try:
         if obj_attr["count"]:
             resource_details = self.get_resource_count(
                 resource_details, obj_attr)
     except KeyError:
         pass
     return resource_details
Example No. 28
 def run(self):
     try:
         all_node_status_up = True
         # check job is parent or child
         job = NS.tendrl.objects.Job(
             job_id=self.parameters['job_id']).load()
         if "parent" not in job.payload:
             # fetch node id using integration_id
             integration_id = self.parameters[
                 'TendrlContext.integration_id']
             key = "indexes/tags/tendrl/integration/%s" % \
                 integration_id
             node_ids_str = etcd_utils.read(key).value
             node_ids = json.loads(node_ids_str)
             # identifying node status using node_id
             logger.log(
                 "info",
                 NS.publisher_id,
                 {"message": "Checking status of nodes %s" % str(node_ids)},
                 job_id=self.parameters['job_id'],
                 flow_id=self.parameters['flow_id'])
             nodes_up = []
             nodes_down = []
             for node in node_ids:
                 node = str(node)
                 # if node_context not found it will give status DOWN
                 node_context = NS.tendrl.objects.NodeContext(
                     node_id=node, status='DOWN').load()
                 if node_context.status == "UP":
                     nodes_up.append(node)
                 else:
                     all_node_status_up = False
                     nodes_down.append(node)
             if all_node_status_up:
                 logger.log("info",
                            NS.publisher_id,
                            {"message": "Nodes %s are up" % nodes_up},
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id'])
             else:
                 logger.log("info",
                            NS.publisher_id,
                            {"message": "Nodes %s are down" % nodes_down},
                            job_id=self.parameters['job_id'],
                            flow_id=self.parameters['flow_id'])
         # no need to check for child job
         return all_node_status_up
     except (etcd.EtcdKeyNotFound, TypeError) as ex:
         logger.log(
             "error",
             NS.get("publisher_id", None), {
                 "message":
                 "Error checking status of nodes. Error: %s" % str(ex)
             },
             job_id=self.parameters['job_id'],
             flow_id=self.parameters['flow_id'])
         return False
Example No. 29
 def run(self):
     aggregate_gluster_objects = NS.monitoring.definitions.\
         get_parsed_defs()["namespace.monitoring"]["graphite_data"]
     _sleep = 0
     while not self._complete.is_set():
         # update monitoring tag in each sync
         NS.node_context = NS.node_context.load()
         current_tags = list(NS.node_context.tags)
         if "tendrl/integration/monitoring" not in current_tags:
             current_tags += ["tendrl/integration/monitoring"]
             NS.node_context.tags = list(set(current_tags))
             NS.node_context.save()
         if self.sync_interval is None:
             try:
                 config_data = json.loads(
                     etcd_utils.read("_NS/gluster/config/data").value)
                 try:
                     self.sync_interval = int(
                         config_data['data']['sync_interval'])
                 except ValueError as ex:
                     logger.log(
                         "error", NS.get("publisher_id", None), {
                             'message':
                             "Unable to parse tendrl-gluster-"
                             "integration config 'sync_interval'"
                         })
                     raise ex
             except etcd.EtcdKeyNotFound as ex:
                 # Before cluster import sync_interval is not populated
                 time.sleep(DEFAULT_SLEEP)
                 continue
         if _sleep > 5:
             _sleep = self.sync_interval
         else:
             _sleep += 1
         try:
             cluster_details = self.plugin_obj.get_central_store_data(
                 aggregate_gluster_objects)
             graphite_utils.create_cluster_alias(cluster_details)
             metrics = graphite_utils.create_metrics(
                 aggregate_gluster_objects, cluster_details)
             metric_list = []
             for metric in metrics:
                 for key, value in metric.items():
                     if value:
                         metric_list.append("tendrl.%s %s %d" %
                                            (key, value, int(time.time())))
             self.plugin_obj.push_metrics(metric_list)
             # Creating or refreshing alert dashboard
             if _sleep > 5:
                 SyncAlertDashboard().refresh_dashboard()
             time.sleep(_sleep)
         except (etcd.EtcdKeyNotFound, AttributeError, KeyError) as ex:
             logger.log("error", NS.get("publisher_id", None),
                        {'message': str(ex)})
             time.sleep(_sleep)
Example No. 30
def test_read():
    setattr(__builtin__, "NS", maps.NamedDict())
    setattr(NS, "_int", maps.NamedDict())
    NS._int.client = importlib.import_module("tendrl.commons"
                                             ".tests.fixtures."
                                             "client").Client()
    NS._int.reconnect = type("Dummy", (object,), {})
    with patch.object(Client, "read",
                      return_value="test") as mock_read:
        obj = etcd_utils.read("key")
        assert obj == "test"
        assert mock_read.called
    with patch.object(Client, "read",
                      raise_etcdconnectionfailed) as mock_read:
        with pytest.raises(etcd.EtcdConnectionFailed):
            obj = etcd_utils.read("key")
    with patch.object(Client,
                      "read", raise_etcdkeynotfound) as mock_read:
        with pytest.raises(etcd.EtcdKeyNotFound):
            obj = etcd_utils.read("key")
Example No. 31
def run():
    try:
        nodes = etcd_utils.read("/nodes")
        for node in nodes.leaves:
            node_id = node.key.split("/")[-1]
            _node_context = NS.tendrl.objects.NodeContext(
                node_id=node_id
            ).load()
            if _node_context.fqdn:
                _node_context.watch_attrs()
    except etcd.EtcdKeyNotFound:
        pass
    return
Example No. 32
 def _sync_cluster_network_details(self):
     try:
         etcd_utils.read("clusters/%s/cluster_network" %
                         NS.tendrl_context.integration_id)
     except etcd.EtcdKeyNotFound:
         try:
             cluster_config = NS.ceph.objects.SyncObject(
                 sync_type='config').load().data
             cluster = NS.tendrl.objects.Cluster(
                 integration_id=NS.tendrl_context.integration_id).load()
             cluster.public_network = cluster_config['public_network']
             cluster.cluster_network = cluster_config['cluster_network']
             cluster.save()
         except etcd.EtcdKeyNotFound as ex:
             Event(
                 Message(priority="error",
                         publisher=NS.publisher_id,
                         payload={
                             'message':
                             "Failed to sync cluster network details"
                         }))
             raise ex
Example No. 33
    def on_change(self, attr, prev_value, current_value):
        if attr == "status":
            if current_value is None:
                self.status = "DOWN"
                self.save()
                msg = "Node {0} is DOWN".format(self.fqdn)
                event_utils.emit_event("node_status",
                                       self.status,
                                       msg,
                                       "node_{0}".format(self.fqdn),
                                       "WARNING",
                                       node_id=self.node_id)

                _tc = NS.tendrl.objects.TendrlContext(
                    node_id=self.node_id).load()
                _tag = "provisioner/%s" % _tc.integration_id
                if _tag in self.tags:
                    _index_key = "/indexes/tags/%s" % _tag
                    self.tags.remove(_tag)
                    self.save()
                    etcd_utils.delete(_index_key)
                    _msg = "node_sync, STALE provisioner node "\
                        "found! re-configuring monitoring "\
                        "(job-id: %s) on this node"
                    payload = {
                        "tags": ["tendrl/node_%s" % self.node_id],
                        "run": "tendrl.flows.ConfigureMonitoring",
                        "status": "new",
                        "parameters": {
                            'TendrlContext.integration_id': _tc.integration_id
                        },
                        "type": "node"
                    }
                    _job_id = str(uuid.uuid4())
                    NS.tendrl.objects.Job(job_id=_job_id,
                                          status="new",
                                          payload=payload).save()
                    logger.log("debug", NS.publisher_id,
                               {"message": _msg % _job_id})

                if _tc.sds_name == "gluster":
                    bricks = etcd_utils.read(
                        "clusters/{0}/Bricks/all/{1}".format(
                            _tc.integration_id, self.fqdn))

                    for brick in bricks.leaves:
                        try:
                            etcd_utils.write("{0}/status".format(brick.key),
                                             "Stopped")
                        except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                            pass
Example No. 34
def find_volume_name(integration_id, hostname, brick_path):
    try:
        vol_name = etcd_utils.read(
            "clusters/%s/Bricks/all/%s/%s/vol_name" %
            (integration_id, hostname, brick_path)).value
        return vol_name
    except EtcdKeyNotFound as ex:
        logger.log(
            "debug", NS.publisher_id, {
                "message":
                "Unable to find volume name for brick"
                " %s:%s" % (hostname, brick_path)
            })
        raise ex
Example No. 35
def aggregate_session_status():
    volumes = NS.tendrl.objects.GlusterVolume(
        NS.tendrl_context.integration_id).load_all()
    georep_status = GeoReplicationSessionStatus()
    if volumes:
        for volume in volumes:
            vol_id = volume.vol_id
            sessions = None
            try:
                sessions = etcd_utils.read(
                    "clusters/%s/Volumes/%s/GeoRepSessions" %
                    (NS.tendrl_context.integration_id, vol_id))
            except etcd.EtcdKeyNotFound:
                continue
            pair_count = int(volume.brick_count)
            for session in sessions.leaves:
                session_status = None
                session_id = session.key.split("GeoRepSessions/")[-1]
                pairs = NS.gluster.objects.GeoReplicationPair(
                    vol_id=vol_id, session_id=session_id).load_all()
                faulty_count = 0
                stopped_count = 0
                paused_count = 0
                created_count = 0
                for pair in pairs:
                    if pair.status.lower() == "faulty":
                        faulty_count += 1
                    elif pair.status.lower() == "created":
                        created_count += 1
                    elif pair.status.lower() == "stopped":
                        stopped_count += 1
                    elif pair.status.lower() == "paused":
                        paused_count += 1
                if created_count == pair_count:
                    session_status = georep_status.CREATED
                elif faulty_count == 0 and (stopped_count == 0 and paused_count
                                            == 0 and created_count == 0):
                    session_status = georep_status.UP
                elif pair_count == faulty_count:
                    session_status = georep_status.DOWN
                elif stopped_count == pair_count:
                    session_status = georep_status.STOPPED
                elif paused_count == pair_count:
                    session_status = georep_status.PAUSED
                else:
                    session_status = georep_status.PARTIAL
                NS.tendrl.objects.GeoReplicationSession(
                    vol_id=vol_id,
                    session_id=session_id,
                    session_status=session_status).save()
Example No. 36
    def load_all(self):
        ins = []
        try:
            self.render()
            value = '/'.join(self.value.split('/')[:-1])
            etcd_resp = etcd_utils.read(value)

            for item in etcd_resp.leaves:
                # When the directory is not empty, NS._int.client.read(key)
                # returns key + child id as the new key. If the directory is
                # empty, it returns only the key itself. If the directory is
                # not present, it raises EtcdKeyNotFound
                if item.key.strip("/") != value.strip("/"):
                    # if the dir is empty, item.key and value are the same
                    self.value = item.key
                    ins.append(self.load())
        except etcd.EtcdKeyNotFound:
            pass
        return ins
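The key comparison above exists because, with the python-etcd style client used here, reading a non-empty directory yields its children as leaves, reading an empty directory yields only the directory entry itself, and reading a missing key raises `EtcdKeyNotFound`. A small sketch isolating that guard, with a hypothetical `child_keys` helper and a plain client object:

import etcd


def child_keys(client, dir_key):
    # Return only the keys of real children, skipping the entry an empty
    # directory read returns for itself, and treating a missing dir as empty.
    try:
        result = client.read(dir_key)
    except etcd.EtcdKeyNotFound:
        return []
    return [
        leaf.key for leaf in result.leaves
        if leaf.key.strip("/") != dir_key.strip("/")
    ]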
Example No. 37
 def hash_compare_with_central_store(self, ttl=None):
     try:
         # Generate current in memory object hash
         self.hash = self._hash()
         _hash_key = "/{0}/hash".format(self.value)
         _stored_hash = None
         try:
             _stored_hash = etcd_utils.read(_hash_key).value
         except etcd.EtcdKeyNotFound:
             return False
         if self.hash == _stored_hash:
             # No changes in stored object and current object,
             # dont save current object to central store
             if ttl:
                 etcd_utils.refresh(self.value, ttl)
             return True
         else:
             return False
     except TypeError:
         # no hash for this object, save the current hash as is
         return False
Example No. 38
    def load(self):
        self.render()
        _copy = self._copy_vars()
        # Check if self.value already set, use it
        if self.value.find('{') < 0:
            _copy.value = self.value
        if "Message" not in _copy.__class__.__name__:
            # If local object.hash is equal to
            # central_store object.hash, return
            if self.hash_compare_with_central_store():
                return _copy

        key = _copy.value + '/data'
        try:
            val_str = etcd_utils.read(key).value
        except etcd.EtcdKeyNotFound:
            return _copy
        loc_dict = json.loads(val_str)
        for attr_name, attr_val in vars(_copy).iteritems():
            _type = self._defs.get("attrs", {}).get(
                attr_name,
                {}
            ).get("type")
            if loc_dict.get(attr_name) in [None, ""]:
                if _type and _type.lower() == 'list':
                    setattr(_copy, attr_name, list())
                if _type and _type.lower() == 'json':
                    setattr(_copy, attr_name, dict())
            else:
                if _type and _type.lower() in ['list']:
                    setattr(
                        _copy,
                        attr_name,
                        json.loads(loc_dict[attr_name])
                    )
                else:
                    setattr(_copy, attr_name, loc_dict[attr_name])
        return _copy
Example No. 39
    def __init__(self, node_id=None, fqdn=None, ipv4_addr=None,
                 tags=None, status=None, sync_status=None,
                 last_sync=None, pkey=None,
                 locked_by=None, *args, **kwargs):
        super(NodeContext, self).__init__(*args, **kwargs)
        self.node_id = node_id or self._get_node_id() or self._create_node_id()
        self.fqdn = fqdn
        self.ipv4_addr = ipv4_addr
        if self.fqdn:
            self.ipv4_addr = socket.gethostbyname(self.fqdn)
        self.locked_by = locked_by

        curr_tags = []
        try:
            _nc_data = etcd_utils.read(
                "/nodes/%s/NodeContext/data" % self.node_id
            ).value
            curr_tags = json.loads(_nc_data)['tags']
        except etcd.EtcdKeyNotFound:
            pass

        try:
            curr_tags = json.loads(curr_tags)
        except (ValueError, TypeError):
            # No existing tags
            pass
        self.tags = tags or []
        self.tags += NS.config.data.get('tags', [])
        self.tags += curr_tags
        self.tags = list(set(self.tags))

        self.status = status or "UP"
        self.sync_status = sync_status
        self.last_sync = last_sync
        self.pkey = pkey or self.fqdn
        self.value = 'nodes/{0}/NodeContext'
Example No. 40
    def volume_remove_brick_force(self, event):
        time.sleep(self.sync_interval)
        # Event returns the bricks list as a single space-separated string
        bricks = event['message']['bricks'].split(" ")
        try:
            for brick in bricks:
                # find fqdn using ip
                ip = socket.gethostbyname(brick.split(":/")[0])
                node_id = etcd_utils.read("indexes/ip/%s" % ip).value
                fqdn = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).load().fqdn
                brick = fqdn + ":" + brick.split(":")[-1]
                fetched_brick = NS.tendrl.objects.GlusterBrick(
                    NS.tendrl_context.integration_id,
                    fqdn=brick.split(":/")[0],
                    brick_dir=brick.split(":/")[1].replace('/', '_')
                ).load()

                # delete brick
                etcd_utils.delete(
                    "clusters/{0}/Bricks/all/{1}/{2}".format(
                        NS.tendrl_context.integration_id,
                        brick.split(":/")[0],
                        brick.split(":/")[1].replace('/', '_')
                    ),
                    recursive=True,
                )

                # delete alert dashboard
                job_id = monitoring_utils.update_dashboard(
                    "%s|%s" % (event['message']['volume'], brick),
                    RESOURCE_TYPE_BRICK,
                    NS.tendrl_context.integration_id,
                    "delete"
                )
                logger.log(
                    "debug",
                    NS.publisher_id,
                    {
                        "message": "Update dashboard job %s "
                        "created" % job_id
                    }
                )

                # delete brick details from graphite
                job_id = monitoring_utils.delete_resource_from_graphite(
                    "%s|%s" % (event['message']['volume'], brick),
                    RESOURCE_TYPE_BRICK,
                    NS.tendrl_context.integration_id,
                    "delete"
                )
                logger.log(
                    "debug",
                    NS.publisher_id,
                    {
                        "message": "Delete resource from graphite job %s "
                        "created" % job_id
                    }
                )

            volume_brick_path = "clusters/{0}/Volumes/{1}/"\
                                "Bricks".format(
                                    NS.tendrl_context.integration_id,
                                    fetched_brick.vol_id,
                                )

            # remove all the brick information under the volume as the
            # subvolume might have changed; let the next sync handle
            # the update of brick info
            etcd_utils.delete(
                volume_brick_path,
                recursive=True
            )

            _trigger_sync_key = 'clusters/%s/_sync_now' % \
                NS.tendrl_context.integration_id
            etcd_utils.write(_trigger_sync_key, 'true')
            etcd_utils.refresh(_trigger_sync_key, self.sync_interval)
        except etcd.EtcdKeyNotFound:
            logger.log(
                "debug",
                NS.publisher_id,
                {
                    "message": "Unable to delete bricks %s" % bricks
                }
            )
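
volume_remove_brick_force() above repeatedly converts a '<host>:/<brick/path>' string into the host and brick_dir components used in the clusters/<id>/Bricks/all/<fqdn>/<brick_dir> keys (the path with '/' replaced by '_'). The real code additionally resolves the host to an fqdn through the indexes/ip/<ip> key; the string handling on its own is:

    def brick_key_parts(brick):
        """Split '<host>:/<path>' into the (host, brick_dir) key components."""
        host = brick.split(":/")[0]
        brick_dir = brick.split(":/")[1].replace("/", "_")
        return host, brick_dir

    host, brick_dir = brick_key_parts("node-1.example.com:/bricks/brick1/vol1")
    print(host)       # node-1.example.com
    print(brick_dir)  # bricks_brick1_vol1
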
Example #41
def sync_volumes(
    volumes, index,
    vol_options,
    sync_ttl,
    cluster_short_name,
    devicetree,
    lvs
):
    NS.node_context = NS.tendrl.objects.NodeContext().load()
    tag_list = NS.node_context.tags
    # Raise alerts for volume state change.
    cluster_provisioner = "provisioner/%s" % NS.tendrl_context.integration_id
    if cluster_provisioner in tag_list:
        try:
            _volume = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            if _volume.locked_by and 'job_id' in _volume.locked_by and \
                _volume.current_job.get('status', '') == 'in_progress':
                # There is an active job on the volume, skip the sync
                return
            stored_volume_status = _volume.status
            current_status = volumes['volume%s.status' % index]
            if stored_volume_status not in [None, ""] and \
                current_status != stored_volume_status:
                msg = ("Status of volume: %s in cluster %s "
                       "changed from %s to %s") % (
                           volumes['volume%s.name' % index],
                           cluster_short_name,
                           stored_volume_status,
                           current_status)
                instance = "volume_%s" % volumes[
                    'volume%s.name' % index
                ]
                event_utils.emit_event(
                    "volume_status",
                    current_status,
                    msg,
                    instance,
                    'WARNING' if current_status == 'Stopped'
                    else 'INFO',
                    tags={"entity_type": RESOURCE_TYPE_VOLUME,
                          "volume_name": volumes['volume%s.name' % index]
                          }
                )
        except (KeyError, etcd.EtcdKeyNotFound) as ex:
            if isinstance(ex, KeyError):
                raise ex
            pass

        volume = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).load()
        volume.vol_type = "arbiter" \
            if int(volumes['volume%s.arbiter_count' % index]) > 0 \
            else volumes['volume%s.type' % index]
        volume.name = volumes['volume%s.name' % index]
        volume.transport_type = volumes['volume%s.transport_type' % index]
        volume.status = volumes['volume%s.status' % index]
        volume.brick_count = volumes['volume%s.brickcount' % index]
        volume.snap_count = volumes['volume%s.snap_count' % index]
        volume.stripe_count = volumes['volume%s.stripe_count' % index]
        volume.replica_count = volumes['volume%s.replica_count' % index]
        volume.subvol_count = volumes['volume%s.subvol_count' % index]
        volume.arbiter_count = volumes['volume%s.arbiter_count' % index]
        volume.disperse_count = volumes['volume%s.disperse_count' % index]
        volume.redundancy_count = volumes['volume%s.redundancy_count' % index]
        volume.quorum_status = volumes['volume%s.quorum_status' % index]
        volume.snapd_status = volumes[
            'volume%s.snapd_svc.online_status' % index]
        volume.snapd_inited = volumes['volume%s.snapd_svc.inited' % index]
        if NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).exists():
            existing_vol = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            volume_profiling_old_value = existing_vol.profiling_enabled
        else:
            volume_profiling_old_value = volume.profiling_enabled
        if ('volume%s.profile_enabled' % index) in volumes:
            value = int(volumes['volume%s.profile_enabled' % index])
            if value == 1:
                volume_profiling_new_value = "yes"
            else:
                volume_profiling_new_value = "no"
        else:
            volume_profiling_new_value = None
        volume.profiling_enabled = volume_profiling_new_value
        if volume_profiling_old_value not in [None, ""] and \
            volume_profiling_old_value != volume_profiling_new_value:
            # Raise an alert for the change in profiling value
            msg = ("Value of volume profiling for volume: %s "
                   "of cluster %s changed from %s to %s" % (
                       volumes['volume%s.name' % index],
                       cluster_short_name,
                       volume_profiling_old_value,
                       volume_profiling_new_value))
            instance = "volume_%s" % \
                volumes['volume%s.name' % index]
            event_utils.emit_event(
                "volume_profiling_status",
                volume_profiling_new_value,
                msg,
                instance,
                'INFO',
                tags={
                    "entity_type": RESOURCE_TYPE_BRICK,
                    "volume_name": volumes[
                        'volume%s.name' % index
                    ]
                }
            )
        volume.save(ttl=sync_ttl)
        # Save the default values of volume options
        vol_opt_dict = {}
        for opt_count in \
            range(1, int(vol_options['volume%s.options.count' % index])):
            vol_opt_dict[
                vol_options[
                    'volume%s.options.key%s' % (index, opt_count)
                ]
            ] = vol_options[
                'volume%s.options.value%s' % (index, opt_count)
            ]
        volume.options = vol_opt_dict
        volume.save()

    rebal_det = NS.gluster.objects.RebalanceDetails(
        vol_id=volumes['volume%s.id' % index],
        rebal_id=volumes['volume%s.rebalance.id' % index],
        rebal_status=volumes['volume%s.rebalance.status' % index],
        rebal_failures=volumes['volume%s.rebalance.failures' % index],
        rebal_skipped=volumes['volume%s.rebalance.skipped' % index],
        rebal_lookedup=volumes['volume%s.rebalance.lookedup' % index],
        rebal_files=volumes['volume%s.rebalance.files' % index],
        rebal_data=volumes['volume%s.rebalance.data' % index],
        time_left=volumes.get('volume%s.rebalance.time_left' % index),
    )
    rebal_det.save(ttl=sync_ttl)
    georep_details.save_georep_details(volumes, index)

    b_index = 1
    # ipv4 addresses of the current node
    try:
        network_ip = []
        networks = NS.tendrl.objects.NodeNetwork().load_all()
        for network in networks:
            if network.ipv4:
                network_ip.extend(network.ipv4)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Could not find "
                    "any ipv4 networks for node"
                    " %s" % NS.node_context.node_id,
                    "exception": ex
                }
            )
        )
    while True:
        try:
            # Update brick node wise
            hostname = volumes[
                'volume%s.brick%s.hostname' % (index, b_index)
            ]
            ip = socket.gethostbyname(hostname)
            try:
                node_id = etcd_utils.read("indexes/ip/%s" % ip).value
                fqdn = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).load().fqdn
                cluster_node_ids = etcd_utils.read(
                    "indexes/tags/tendrl/integration/%s" %
                    NS.tendrl_context.integration_id
                ).value
                cluster_node_ids = json.loads(cluster_node_ids)
                if NS.node_context.fqdn != fqdn or \
                        node_id not in cluster_node_ids:
                    b_index += 1
                    continue
            except(TypeError, etcd.EtcdKeyNotFound):
                b_index += 1
                continue
            sub_vol_size = (int(
                volumes['volume%s.brickcount' % index]
            )) / int(
                volumes['volume%s.subvol_count' % index]
            )
            brick_name = NS.node_context.fqdn
            brick_name += ":"
            brick_name += volumes['volume%s.brick%s' '.path' % (
                index,
                b_index
            )].split(":")[-1].replace("/", "_")

            # Raise alerts if the brick path changes
            try:
                stored_brick = NS.tendrl.objects.GlusterBrick(
                    NS.tendrl_context.integration_id,
                    NS.node_context.fqdn,
                    brick_dir=brick_name.split(":_")[-1]
                ).load()
                current_status = volumes.get(
                    'volume%s.brick%s.status' % (index, b_index)
                )
                if stored_brick.status and \
                    current_status != stored_brick.status:
                    msg = ("Brick:%s in volume:%s has %s"
                           ) % (
                               volumes['volume%s.brick%s' '.path' % (
                                   index,
                                   b_index
                               )],
                               volumes['volume%s.' 'name' % index],
                               current_status)
                    instance = "volume_%s|brick_%s" % (
                        volumes['volume%s.name' % index],
                        volumes['volume%s.brick%s.path' % (
                            index,
                            b_index
                        )]
                    )
                    event_utils.emit_event(
                        "brick_status",
                        current_status,
                        msg,
                        instance,
                        'WARNING' if current_status == 'Stopped'
                        else 'INFO',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": volumes[
                                  'volume%s.' 'name' % index]
                              }
                    )

            except etcd.EtcdKeyNotFound:
                pass

            brk_pth = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s"

            vol_brick_path = brk_pth % (
                NS.tendrl_context.integration_id,
                volumes['volume%s.id' % index],
                str((b_index - 1) / sub_vol_size),
                brick_name
            )

            etcd_utils.write(vol_brick_path, "")
            brick = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                NS.node_context.fqdn,
                brick_dir=brick_name.split(":_")[-1]
            ).load()
            brick.integration_id = NS.tendrl_context.integration_id
            brick.fqdn = NS.node_context.fqdn
            brick.brick_dir = brick_name.split(":_")[-1]
            brick.name = brick_name
            brick.vol_id = volumes['volume%s.id' % index]
            brick.sequence_number = b_index
            brick.brick_path = volumes[
                'volume%s.brick%s.path' % (index, b_index)
            ]
            brick.hostname = volumes.get(
                'volume%s.brick%s.hostname' % (index, b_index)
            )
            brick.port = volumes.get(
                'volume%s.brick%s.port' % (index, b_index)
            )
            brick.vol_name = volumes['volume%s.name' % index]
            brick.used = True
            brick.node_id = NS.node_context.node_id
            brick.status = volumes.get(
                'volume%s.brick%s.status' % (index, b_index)
            )
            brick.filesystem_type = volumes.get(
                'volume%s.brick%s.filesystem_type' % (index, b_index)
            )
            brick.mount_opts = volumes.get(
                'volume%s.brick%s.mount_options' % (index, b_index)
            )
            brick.utilization = brick_utilization.brick_utilization(
                volumes['volume%s.brick%s.path' % (index, b_index)],
                lvs
            )
            brick.client_count = volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            )
            brick.is_arbiter = volumes.get(
                'volume%s.brick%s.is_arbiter' % (index, b_index)
            )
            brick.save(ttl=sync_ttl)
            # sync brick device details
            brick_device_details.\
                update_brick_device_details(
                    brick_name,
                    volumes[
                        'volume%s.brick%s.path' % (
                            index, b_index)
                    ],
                    devicetree,
                    sync_ttl
                )

            # Sync the brick client details
            c_index = 1
            if volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            ) > 0:
                while True:
                    try:
                        NS.gluster.objects.ClientConnection(
                            brick_name=brick_name,
                            fqdn=NS.node_context.fqdn,
                            brick_dir=brick_name.split(":_")[-1],
                            hostname=volumes[
                                'volume%s.brick%s.client%s.hostname' % (
                                    index, b_index, c_index
                                )
                            ],
                            bytesread=volumes[
                                'volume%s.brick%s.client%s.bytesread' % (
                                    index, b_index, c_index
                                )
                            ],
                            byteswrite=volumes[
                                'volume%s.brick%s.client%s.byteswrite' % (
                                    index, b_index, c_index
                                )
                            ],
                            opversion=volumes[
                                'volume%s.brick%s.client%s.opversion' % (
                                    index, b_index, c_index
                                )
                            ]
                        ).save(ttl=sync_ttl)
                    except KeyError:
                        break
                    c_index += 1
            sync_ttl += 4
            b_index += 1
        except KeyError:
            break
    return b_index
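
sync_volumes() above places each brick under a subvolume directory whose index is derived from the brick's 1-based position: sub_vol_size = brickcount / subvol_count, and the subvolume number is (b_index - 1) / sub_vol_size using integer division. For example:

    def subvolume_index(b_index, brick_count, subvol_count):
        """Subvolume a 1-based brick index belongs to (integer division)."""
        sub_vol_size = brick_count // subvol_count
        return (b_index - 1) // sub_vol_size

    # a 2x3 distributed-replicate volume: 6 bricks, 2 subvolumes of size 3
    for b_index in range(1, 7):
        print("brick%s -> subvolume%s" % (b_index, subvolume_index(b_index, 6, 2)))
    # bricks 1-3 land in subvolume0, bricks 4-6 in subvolume1
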
Example #42
 def update_cluster_details(self, integration_id):
     try:
         nodes = etcd_utils.read(
             "/clusters/%s/nodes" % integration_id
         )
         for node in nodes.leaves:
             _cnc = NS.tendrl.objects.ClusterNodeContext(
                 node_id=node.key.split("/")[-1],
                 integration_id=integration_id
             ).load()
             # Verify all nodes in a cluster are down
             if str(_cnc.status).lower() != "down" and \
                     str(_cnc.is_managed).lower() == "yes":
                  # If any managed node is not down, don't update
                  # cluster details; unmanaged nodes need not be
                  # considered
                 return
         # when all managed nodes are down update cluster details
         global_details = NS.tendrl.objects.GlobalDetails(
             integration_id=integration_id
         ).load()
         # Update cluster as unhealthy
         if global_details.status.lower() == "healthy":
             global_details.status = "unhealthy"
             global_details.save()
             _cluster = NS.tendrl.objects.Cluster(
                 integration_id=integration_id
             ).load()
             msg = "Cluster:%s is %s" % (
                 _cluster.short_name, "unhealthy")
             instance = "cluster_%s" % integration_id
             event_utils.emit_event(
                 "cluster_health_status",
                 "unhealthy",
                 msg,
                 instance,
                 'WARNING',
                 integration_id=integration_id
             )
          # Mark all bricks as down
         nodes = etcd_utils.read(
             "/clusters/%s/Bricks/all" % integration_id
         )
         for node in nodes.leaves:
             bricks = NS.tendrl.objects.GlusterBrick(
                 integration_id,
                 fqdn=node.key.split("/")[-1]
             ).load_all()
             for brick in bricks:
                 if brick.status.lower() != "stopped":
                     brick.status = "Stopped"
                     brick.save()
                     msg = ("Brick:%s in volume:%s has %s") % (
                         brick.brick_path,
                         brick.vol_name,
                         "Stopped"
                     )
                     instance = "volume_%s|brick_%s" % (
                         brick.vol_name,
                         brick.brick_path
                     )
                     event_utils.emit_event(
                         "brick_status",
                         "Stopped",
                         msg,
                         instance,
                         "WARNING",
                         integration_id=integration_id,
                         tags={"entity_type": "brick",
                               "volume_name": brick.vol_name,
                               "node_id": brick.node_id
                               }
                     )
          # Mark all volumes as down
         volumes = NS.tendrl.objects.GlusterVolume(
             integration_id
         ).load_all()
         for volume in volumes:
             if volume.state.lower() != "down":
                 volume.state = "down"
                 volume.status = "Stopped"
                 volume.save()
                 msg = "Volume:%s is %s" % (volume.name, "down")
                 instance = "volume_%s" % volume.name
                 event_utils.emit_event(
                     "volume_state",
                     "down",
                     msg,
                     instance,
                     "WARNING",
                     integration_id=integration_id,
                     tags={"entity_type": "volume",
                           "volume_name": volume.name
                           }
                 )
     except etcd.EtcdKeyNotFound:
         pass
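
update_cluster_details() above only marks the cluster unhealthy when every managed node is down; any managed node that is still up causes an early return, and unmanaged nodes are ignored. The check, reduced to plain data instead of ClusterNodeContext objects, is:

    def all_managed_nodes_down(node_contexts):
        """node_contexts: dicts with 'status' and 'is_managed' keys."""
        for cnc in node_contexts:
            if str(cnc.get("status")).lower() != "down" and \
                    str(cnc.get("is_managed")).lower() == "yes":
                # a managed node is still up, nothing to update
                return False
        return True

    nodes = [
        {"status": "DOWN", "is_managed": "yes"},
        {"status": "UP", "is_managed": "no"},   # unmanaged, ignored
    ]
    print(all_managed_nodes_down(nodes))  # True
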
Example #43
    def run(self):
        logger.log(
            "info",
            NS.publisher_id,
            {"message": "%s running" % self.__class__.__name__}
        )

        gluster_brick_dir = NS.gluster.objects.GlusterBrickDir()
        gluster_brick_dir.save()

        cluster = NS.tendrl.objects.Cluster(
            integration_id=NS.tendrl_context.integration_id
        ).load()
        if cluster.cluster_network in [None, ""]:
            try:
                node_networks = NS.tendrl.objects.NodeNetwork().load_all()
                cluster.cluster_network = node_networks[0].subnet
                cluster.save()
            except etcd.EtcdKeyNotFound as ex:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": "Failed to sync cluster network details"}
                )
        _sleep = 0
        while not self._complete.is_set():
            # To detect out of band deletes
            # refresh gluster object inventory at config['sync_interval']
            SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 100
            NS.node_context = NS.node_context.load()
            NS.tendrl_context = NS.tendrl_context.load()
            if _sleep > 5:
                _sleep = int(NS.config.data.get("sync_interval", 10))
            else:
                _sleep += 1

            try:
                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id
                ).load()
                if (_cluster.status == "importing" and (
                    _cluster.current_job['status'] == 'failed')) or \
                    _cluster.status == "unmanaging" or \
                    _cluster.status == "set_volume_profiling":
                    time.sleep(_sleep)
                    continue

                _cnc = NS.tendrl.objects.ClusterNodeContext(
                    node_id=NS.node_context.node_id
                ).load()
                _cnc.is_managed = "yes"
                _cnc.save()
                subprocess.call(
                    [
                        'gluster',
                        'get-state',
                        'glusterd',
                        'odir',
                        '/var/run',
                        'file',
                        'glusterd-state',
                        'detail'
                    ]
                )
                raw_data = ini2json.ini_to_dict(
                    '/var/run/glusterd-state'
                )
                subprocess.call(['rm', '-rf', '/var/run/glusterd-state'])
                subprocess.call(
                    [
                        'gluster',
                        'get-state',
                        'glusterd',
                        'odir',
                        '/var/run',
                        'file',
                        'glusterd-state-vol-opts',
                        'volumeoptions'
                    ]
                )
                raw_data_options = ini2json.ini_to_dict(
                    '/var/run/glusterd-state-vol-opts'
                )
                subprocess.call(
                    [
                        'rm',
                        '-rf',
                        '/var/run/glusterd-state-vol-opts'
                    ]
                )
                sync_object = NS.gluster.objects.\
                    SyncObject(data=json.dumps(raw_data))
                sync_object.save()

                if "Peers" in raw_data:
                    index = 1
                    peers = raw_data["Peers"]
                    disconnected_hosts = []
                    while True:
                        try:
                            peer = NS.tendrl.\
                                objects.GlusterPeer(
                                    peer_uuid=peers['peer%s.uuid' % index],
                                    hostname=peers[
                                        'peer%s.primary_hostname' % index
                                    ],
                                    state=peers['peer%s.state' % index],
                                    connected=peers['peer%s.connected' % index]
                                )
                            try:
                                stored_peer_status = None
                                # find peer detail using hostname
                                ip = socket.gethostbyname(
                                    peers['peer%s.primary_hostname' % index]
                                )
                                node_id = etcd_utils.read(
                                    "/indexes/ip/%s" % ip
                                ).value
                                stored_peer = NS.tendrl.objects.GlusterPeer(
                                    peer_uuid=peers['peer%s.uuid' % index],
                                    node_id=node_id
                                ).load()
                                stored_peer_status = stored_peer.connected
                                current_status = peers[
                                    'peer%s.connected' % index
                                ]
                                if stored_peer_status and \
                                    current_status != stored_peer_status:
                                    msg = (
                                        "Peer %s in cluster %s "
                                        "is %s"
                                    ) % (
                                        peers[
                                            'peer%s.primary_hostname' %
                                            index
                                        ],
                                        _cluster.short_name,
                                        current_status
                                    )
                                    instance = "peer_%s" % peers[
                                        'peer%s.primary_hostname' % index
                                    ]
                                    event_utils.emit_event(
                                        "peer_status",
                                        current_status,
                                        msg,
                                        instance,
                                        'WARNING'
                                        if current_status != 'Connected'
                                        else 'INFO'
                                    )
                                    # save current status in actual peer
                                    # directory also
                                    stored_peer.connected = current_status
                                    stored_peer.save()
                                    # Record the disconnected host name to
                                    # raise a brick alert
                                    if current_status.lower() == \
                                        "disconnected":
                                        disconnected_hosts.append(
                                            peers[
                                                'peer%s.primary_hostname' %
                                                index
                                            ]
                                        )
                            except etcd.EtcdKeyNotFound:
                                pass
                            SYNC_TTL += 5
                            peer.save(ttl=SYNC_TTL)
                            index += 1
                        except KeyError:
                            break
                    # Raise an alert for bricks when peer disconnected
                    # or node goes down
                    for disconnected_host in disconnected_hosts:
                        brick_status_alert(
                            disconnected_host
                        )
                if "Volumes" in raw_data:
                    # create devicetree using lsblk
                    devicetree = get_device_tree()
                    # find lvs
                    lvs = brick_utilization.get_lvs()
                    index = 1
                    volumes = raw_data['Volumes']
                    total_brick_count = 0
                    while True:
                        try:
                            b_count = sync_volumes(
                                volumes, index,
                                raw_data_options.get('Volume Options'),
                                SYNC_TTL + VOLUME_TTL,
                                _cluster.short_name,
                                devicetree,
                                lvs
                            )
                            index += 1
                            SYNC_TTL += 1
                            total_brick_count += b_count - 1
                        except KeyError:
                            global VOLUME_TTL
                            # from the second sync onwards, volume ttl is
                            # SYNC_TTL + (no. of volumes) * 20 +
                            # (no. of bricks) * 10 + 160
                            if index > 1:
                                volume_count = index - 1
                                # When all nodes are down we mark all
                                # volumes as down; node status TTL is 160,
                                # so make sure volumes are present in etcd
                                # while raising the volume-down alert
                                VOLUME_TTL = (volume_count * 20) + (
                                    total_brick_count * 10) + 160
                            break
                    # populate the volume specific options
                    reg_ex = re.compile("^volume[0-9]+.options+")
                    options = {}
                    for key in volumes.keys():
                        if reg_ex.match(key):
                            options[key] = volumes[key]
                    for key in options.keys():
                        volname = key.split('.')[0]
                        vol_id = volumes['%s.id' % volname]
                        dict1 = {}
                        for k, v in options.items():
                            if k.startswith('%s.options' % volname):
                                dict1['.'.join(k.split(".")[2:])] = v
                                options.pop(k, None)
                        volume = NS.tendrl.objects.GlusterVolume(
                            NS.tendrl_context.integration_id,
                            vol_id=vol_id
                        ).load()
                        if volume.options is not None:
                            dest = dict(volume.options)
                            dest.update(dict1)
                            volume.options = dest
                            volume.save()

                # Sync cluster global details
                if "provisioner/%s" % NS.tendrl_context.integration_id \
                    in NS.node_context.tags:
                    all_volumes = NS.tendrl.objects.GlusterVolume(
                        NS.tendrl_context.integration_id
                    ).load_all() or []
                    volumes = []
                    for volume in all_volumes:
                        if not str(volume.deleted).lower() == "true" and \
                            volume.current_job.get('status', '') \
                            in ['', 'finished', 'failed'] and \
                            volume.vol_id not in [None, ''] and \
                            volume.name not in [None, '']:
                            # refresh the volume TTL only on the first sync;
                            # the TTL grows with the no. of volumes
                            if _cnc.first_sync_done in [None, "no", ""]:
                                etcd_utils.refresh(
                                    volume.value,
                                    SYNC_TTL + VOLUME_TTL
                                )
                            volumes.append(volume)
                    cluster_status.sync_cluster_status(
                        volumes, SYNC_TTL + VOLUME_TTL
                    )
                    utilization.sync_utilization_details(volumes)
                    client_connections.sync_volume_connections(volumes)
                    georep_details.aggregate_session_status()
                    try:
                        evt.process_events()
                    except etcd.EtcdKeyNotFound:
                        pass
                    rebalance_status.sync_volume_rebalance_status(volumes)
                    rebalance_status.sync_volume_rebalance_estimated_time(
                        volumes
                    )
                    snapshots.sync_volume_snapshots(
                        raw_data['Volumes'],
                        int(NS.config.data.get(
                            "sync_interval", 10
                        )) + len(volumes) * 4
                    )
                    # update alert count
                    update_cluster_alert_count()
                # check and enable volume profiling
                if "provisioner/%s" % NS.tendrl_context.integration_id in \
                    NS.node_context.tags:
                    self._update_volume_profiling()

                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id
                ).load()
                if _cluster.exists():
                    _cluster = _cluster.load()
                    _cluster.last_sync = str(tendrl_now())
                    # Mark the first sync done flag
                    _cnc = NS.tendrl.objects.ClusterNodeContext(
                        node_id=NS.node_context.node_id
                    ).load()
                    if _cnc.first_sync_done in [None, "no"]:
                        _cnc.first_sync_done = "yes"
                        _cnc.save()
                    if _cluster.current_job.get(
                        'status', ''
                    ) in ['', 'finished', 'failed'] and \
                        _cluster.status in [None, ""]:
                        _cluster.save()
            except Exception as ex:
                Event(
                    ExceptionMessage(
                        priority="error",
                        publisher=NS.publisher_id,
                        payload={"message": "gluster sds state sync error",
                                 "exception": ex
                                 }
                    )
                )
            try:
                etcd_utils.read(
                    '/clusters/%s/_sync_now' %
                    NS.tendrl_context.integration_id
                )
                continue
            except etcd.EtcdKeyNotFound:
                pass

            time.sleep(_sleep)

        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "%s complete" % self.__class__.__name__}
        )
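
The sync thread above walks the flattened get-state output ('peer1.uuid', 'peer2.uuid', ..., 'volume1.name', ...) by incrementing an index until a missing key raises KeyError. That iteration pattern, separated from the Tendrl objects, looks like:

    def iter_indexed(data, prefix, field):
        """Yield data['<prefix><n>.<field>'] for n = 1, 2, ... until a key is missing."""
        index = 1
        while True:
            try:
                yield data["%s%s.%s" % (prefix, index, field)]
            except KeyError:
                break
            index += 1

    peers = {
        "peer1.uuid": "a1", "peer1.connected": "Connected",
        "peer2.uuid": "b2", "peer2.connected": "Disconnected",
    }
    print(list(iter_indexed(peers, "peer", "uuid")))       # ['a1', 'b2']
    print(list(iter_indexed(peers, "peer", "connected")))  # ['Connected', 'Disconnected']
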
Example #44
    def run(self):
        if "Node[]" not in self.parameters:
            integration_id = self.parameters['TendrlContext.integration_id']
            short_name = self.parameters.get('Cluster.short_name', None)
            if short_name:
                if not re.match('^[a-zA-Z0-9][A-Za-z0-9_]*$',
                                short_name) or \
                   len(short_name) > 64:
                    raise FlowExecutionFailedError(
                        "Invalid cluster short_name: %s. "
                        "Only alpha-numeric and underscore "
                        "allowed for short name, max length 64 chars" %
                        short_name
                    )
            # Check for uniqueness of cluster short name
            _clusters = NS._int.client.read(
                '/clusters'
            )
            for entry in _clusters.leaves:
                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=entry.key.split('/')[-1]
                ).load()
                if _cluster.short_name and short_name and \
                    _cluster.is_managed == 'yes' and \
                    _cluster.short_name == short_name.strip().lower():
                    raise FlowExecutionFailedError(
                        "Cluster with name: %s already exists" % short_name
                    )
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id).load()
            if (_cluster.status is not None and
                    _cluster.status != "" and
                    _cluster.current_job['status'] == 'in_progress' and
                    _cluster.status in
                    ["importing", "unmanaging", "expanding"]):
                raise FlowExecutionFailedError(
                    "Another job in progress for cluster, please wait till "
                    "the job finishes (job_id: %s) (integration_id: %s) " % (
                        _cluster.current_job['job_id'],
                        _cluster.integration_id
                    )
                )

            if short_name not in [None, ""]:
                _cluster.short_name = short_name
            else:
                _cluster.short_name = integration_id
            _cluster.status = "importing"
            _cluster.current_job = {
                'job_id': self.job_id,
                'job_name': self.__class__.__name__,
                'status': 'in_progress'
            }
            _cluster.save()

            try:
                integration_id_index_key = \
                    "indexes/tags/tendrl/integration/%s" % integration_id
                _node_ids = etcd_utils.read(
                    integration_id_index_key).value
                self.parameters["Node[]"] = json.loads(_node_ids)
            except etcd.EtcdKeyNotFound:
                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id).load()
                _cluster.status = ""
                _cluster.current_job['status'] = 'failed'
                _cluster.save()
                raise FlowExecutionFailedError("Cluster with "
                                               "integration_id "
                                               "(%s) not found, cannot "
                                               "import" % integration_id)
            else:
                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id
                ).load()
                _cluster.volume_profiling_flag = self.parameters[
                    'Cluster.volume_profiling_flag']
                _cluster.save()

        try:
            super(ImportCluster, self).run()
            # Check if this job is parent and then only set status
            # This could be called from parent import cluster or
            # even from expand cluster flow. We should not set the
            # cluster's current job status from child jobs
            _job = NS.tendrl.objects.Job(job_id=self.job_id).load()
            if 'parent' not in _job.payload and _job.status != "failed":
                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=NS.tendrl_context.integration_id
                ).load()
                _cluster.status = ""
                _cluster.current_job['status'] = "finished"
                _cluster.is_managed = "yes"
                _cluster.save()
        except (FlowExecutionFailedError,
                AtomExecutionFailedError,
                Exception) as ex:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id).load()
            _cluster.status = ""
            _cluster.current_job['status'] = 'failed'
            _errors = []
            if hasattr(ex, 'message'):
                _errors = [ex.message]
            else:
                _errors = [str(ex)]
            if _errors:
                _cluster.errors = _errors
            _cluster.save()
            raise FlowExecutionFailedError(str(
                traceback.format_exception(exc_type,
                                           exc_value,
                                           exc_traceback)
            ))
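
ImportCluster.run() above validates the optional Cluster.short_name with a regex (alphanumeric plus underscore, starting with an alphanumeric character) and a 64-character limit before checking uniqueness. Just that validation, as a standalone helper:

    import re

    SHORT_NAME_RE = re.compile(r'^[a-zA-Z0-9][A-Za-z0-9_]*$')

    def valid_short_name(short_name):
        """Alphanumeric/underscore, must start alphanumeric, at most 64 chars."""
        return bool(SHORT_NAME_RE.match(short_name)) and len(short_name) <= 64

    print(valid_short_name("gluster_cluster_1"))  # True
    print(valid_short_name("_bad"))               # False
    print(valid_short_name("a" * 65))             # False
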
Example #45
    def run(self):
        integration_id = self.parameters['TendrlContext.integration_id']
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=integration_id
        ).load()
        if _cluster.status is not None and _cluster.status != "" and \
            _cluster.status in ["importing", "unmanaging", "expanding"]:
            raise FlowExecutionFailedError(
                "Another job in progress for cluster, please wait till "
                "the job finishes (job_id: %s) (integration_id: %s) " % (
                    _cluster.current_job['job_id'],
                    integration_id
                )
            )
        _lock_details = {
            'node_id': NS.node_context.node_id,
            'fqdn': NS.node_context.fqdn,
            'tags': NS.node_context.tags,
            'type': NS.type,
            'job_name': self.__class__.__name__,
            'job_id': self.job_id
        }
        _cluster.locked_by = _lock_details
        _cluster.status = "expanding"
        _cluster.current_job = {
            'job_id': self.job_id,
            'job_name': self.__class__.__name__,
            'status': 'in_progress'
        }
        _cluster.save()

        try:
            integration_id_index_key = \
                "indexes/tags/tendrl/integration/%s" % integration_id
            node_ids = etcd_utils.read(
                integration_id_index_key).value
            node_ids = json.loads(node_ids)
        except etcd.EtcdKeyNotFound:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=integration_id
            ).load()
            _cluster.locked_by = {}
            _cluster.status = "expand_pending"
            _cluster.current_job = {
                'job_id': self.job_id,
                'job_name': self.__class__.__name__,
                'status': 'failed'
            }
            _cluster.save()
            raise FlowExecutionFailedError(
                "Cluster with integration_id "
                "(%s) not found, cannot "
                "import" % integration_id
            )

        job_ids = []
        new_peers = []
        # Remove the current node from the list as it's already
        # participating in the cluster
        node_ids.remove(NS.node_context.node_id)
        for node_id in node_ids:
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=node_id
            ).load()
            if _cnc.is_managed not in [None, ""] \
                and _cnc.is_managed.lower() == "yes":
                continue

            params = {
                'TendrlContext.integration_id': integration_id,
                'Node[]': [node_id],
                'Cluster.volume_profiling_flag':
                _cluster.volume_profiling_flag
            }
            payload = {
                "tags": ["tendrl/node_%s" % node_id],
                "run": "tendrl.flows.ImportCluster",
                "status": "new",
                "parent": self.parameters['job_id'],
                "parameters": params,
                "type": "node"
            }
            _job_id = str(uuid.uuid4())
            NS.tendrl.objects.Job(
                job_id=_job_id, status="new", payload=payload
            ).save()
            logger.log(
                "info",
                NS.publisher_id,
                {
                    "message": "ImportCluster %s (jobID: %s) : "
                    "importing host %s" % (
                        _cluster.short_name,
                        _job_id,
                        node_id
                    )
                },
                job_id=self.parameters['job_id']
            )
            job_ids.append(_job_id)
            new_peers.append(node_id)

        loop_count = 0
        # Wait for (no of nodes) * 6 minutes for import to complete
        wait_count = len(job_ids) * 36
        while True:
            child_jobs_failed = []
            if loop_count >= wait_count:
                logger.log(
                    "info",
                    NS.publisher_id,
                    {
                        "message": "Import jobs not yet complete "
                        "on all new nodes %s on cluster %s. Timing out. " %
                        (str(node_ids), _cluster.short_name)
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                _cluster = NS.tendrl.objects.Cluster(
                    integration_id=integration_id
                ).load()
                _cluster.locked_by = {}
                _cluster.status = "expand_pending"
                _cluster.current_job = {
                    'job_id': self.job_id,
                    'job_name': self.__class__.__name__,
                    'status': 'failed'
                }
                _cluster.save()
                raise FlowExecutionFailedError(
                    "Failed to expand cluster with integration_id "
                    "(%s)" % integration_id
                )

            time.sleep(10)
            finished = True
            for job_id in job_ids:
                job = NS.tendrl.objects.Job(job_id=job_id).load()
                if job.status not in ["finished", "failed"]:
                    finished = False
                elif job.status == "failed":
                    child_jobs_failed.append(job.job_id)
            if finished:
                break
            else:
                loop_count += 1
                continue
        if len(child_jobs_failed) > 0:
            _msg = "Child jobs failed are %s" % child_jobs_failed
            logger.log(
                "error",
                NS.publisher_id,
                {"message": _msg},
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=integration_id
            ).load()
            _cluster.status = "expand_pending"
            _cluster.locked_by = {}
            _cluster.current_job = {
                'status': "failed",
                'job_name': self.__class__.__name__,
                'job_id': self.job_id
            }
            _cluster.save()
            raise FlowExecutionFailedError(
                "Failed to expand cluster with integration_id "
                "(%s)" % integration_id
            )
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=integration_id
        ).load()
        _cluster.status = ""
        _cluster.locked_by = {}
        _cluster.current_job = {
            'status': "finished",
            'job_name': self.__class__.__name__,
            'job_id': self.job_id
        }
        _cluster.save()

        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "Newly detected nodes: %s added to the "
                "cluster %s)" % (
                    str(new_peers),
                    _cluster.short_name
                ),
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id']
        )
        return True
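
The expand flow above polls the child ImportCluster jobs every 10 seconds, times out after len(job_ids) * 36 iterations (roughly 6 minutes per node), and collects the ids of any failed jobs. A simplified, self-contained version of that wait loop, with the job store and the poll interval stubbed out, might be:

    import time

    def wait_for_jobs(job_statuses, poll_interval=10, iterations_per_job=36):
        """job_statuses: callable returning {job_id: status}; returns failed job ids.

        Raises RuntimeError on timeout, mirroring FlowExecutionFailedError above.
        """
        wait_count = len(job_statuses()) * iterations_per_job
        loop_count = 0
        while True:
            if loop_count >= wait_count:
                raise RuntimeError("import jobs did not finish in time")
            time.sleep(poll_interval)
            statuses = job_statuses()
            failed = [jid for jid, st in statuses.items() if st == "failed"]
            if all(st in ("finished", "failed") for st in statuses.values()):
                return failed
            loop_count += 1

    # stubbed job store: both child jobs have already finished
    print(wait_for_jobs(lambda: {"j1": "finished", "j2": "finished"},
                        poll_interval=0))  # []
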
Example #46
    def run(self):
        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "Deleting cluster details."
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id'],
        )
        integration_id = self.parameters['TendrlContext.integration_id']

        etcd_keys_to_delete = []
        etcd_keys_to_delete.append(
            "/clusters/%s/nodes" % integration_id
        )
        etcd_keys_to_delete.append(
            "/clusters/%s/Bricks" % integration_id
        )
        etcd_keys_to_delete.append(
            "/clusters/%s/Volumes" % integration_id
        )
        etcd_keys_to_delete.append(
            "/clusters/%s/GlobalDetails" % integration_id
        )
        etcd_keys_to_delete.append(
            "/clusters/%s/TendrlContext" % integration_id
        )
        etcd_keys_to_delete.append(
            "/clusters/%s/Utilization" % integration_id
        )
        etcd_keys_to_delete.append(
            "/clusters/%s/raw_map" % integration_id
        )
        etcd_keys_to_delete.append(
            "/alerting/clusters/%s" % integration_id
        )
        nodes = etcd_utils.read(
            "/clusters/%s/nodes" % integration_id
        )
        node_ids = []
        for node in nodes.leaves:
            node_id = node.key.split("/")[-1]
            node_ids.append(node_id)
            key = "/alerting/nodes/%s" % node_id
            etcd_keys_to_delete.append(
                key
            )
            try:
                # delete node alerts from /alerting/alerts
                node_alerts = etcd_utils.read(key)
                for node_alert in node_alerts.leaves:
                    etcd_keys_to_delete.append(
                        "/alerting/alerts/%s" % node_alert.key.split(
                            "/")[-1]
                    )
            except etcd.EtcdKeyNotFound:
                # No node alerts, continue
                pass

        # Find the alerting/alerts entries to be deleted
        try:
            cluster_alert_ids = etcd_utils.read(
                "/alerting/clusters/%s" % integration_id
            )
            for entry in cluster_alert_ids.leaves:
                ca_id = entry.key.split("/")[-1]
                etcd_keys_to_delete.append(
                    "/alerting/alerts/%s" % ca_id
                )
        except etcd.EtcdKeyNotFound:
            # No cluster alerts, continue
            pass

        # Remove the cluster details
        for key in list(set(etcd_keys_to_delete)):
            try:
                etcd_utils.delete(key, recursive=True)
            except etcd.EtcdKeyNotFound:
                logger.log(
                    "debug",
                    NS.publisher_id,
                    {
                        "message": "%s key not found for deletion" %
                        key
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                )
                continue
        # remove short name
        cluster = NS.tendrl.objects.Cluster(
            integration_id=integration_id
        ).load()
        cluster.short_name = ""
        cluster.save()

        return True
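
The delete flow above assembles the list of etcd subtrees to remove for a cluster, de-duplicates it and deletes each key recursively (it also collects the per-alert keys found under the node and cluster alert directories, which are omitted here). Building the base key list for a given integration id is plain string formatting:

    def cluster_keys_to_delete(integration_id, node_ids):
        """De-duplicated etcd subtrees removed when a cluster is deleted."""
        keys = [
            "/clusters/%s/nodes" % integration_id,
            "/clusters/%s/Bricks" % integration_id,
            "/clusters/%s/Volumes" % integration_id,
            "/clusters/%s/GlobalDetails" % integration_id,
            "/clusters/%s/TendrlContext" % integration_id,
            "/clusters/%s/Utilization" % integration_id,
            "/clusters/%s/raw_map" % integration_id,
            "/alerting/clusters/%s" % integration_id,
        ]
        keys += ["/alerting/nodes/%s" % node_id for node_id in node_ids]
        return sorted(set(keys))

    for key in cluster_keys_to_delete("abc-123", ["node-1", "node-2"]):
        print(key)
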
Example #47
    def snapshot_restored(self, event):
        time.sleep(self.sync_interval)
        message = event["message"]
        volume = message['volume_name']
        volume_id = ""
        bricks_to_remove = []

        # get the list of current bricks by running get-state
        output_dir = '/var/run/'
        output_file = 'glusterd-state-snapshot-%s' % str(uuid.uuid4())
        subprocess.call(
            [
                'gluster',
                'get-state',
                'glusterd',
                'odir',
                output_dir,
                'file',
                output_file,
                'detail'
            ]
        )
        raw_data = ini2json.ini_to_dict(
            output_dir + output_file
        )
        subprocess.call(['rm', '-rf', output_dir + output_file])
        index = 1
        while True:
            try:
                current_vol = 'volume%s.name' % index
                if raw_data['Volumes'][current_vol] == volume:
                    current_vol_id = 'volume%s.id' % index
                    volume_id = raw_data['Volumes'][current_vol_id]
                    break
            except KeyError:
                return
            index += 1
        latest_bricks = []
        b_index = 1
        while True:
            try:
                curr_brick = 'volume%s.brick%s.path' % (
                    index, b_index
                )
                brick = raw_data['Volumes'][curr_brick]
                b_index += 1
            except KeyError:
                break
            latest_bricks.append(brick)

        # get the list of bricks in etcd for this volume

        sub_volumes = etcd_utils.read(
            "/clusters/{0}/Volumes/{1}/Bricks".format(
                NS.tendrl_context.integration_id,
                volume_id
            )
        )
        for sub_volume in sub_volumes.leaves:
            bricks = etcd_utils.read(
                sub_volume.key
            )
            for brick in bricks.leaves:
                fqdn = brick.key.split('/')[-1].split(':')[0]
                path = brick.key.split('/')[-1].split(':')[-1][1:]

                brick_path = "clusters/{0}/Bricks/"\
                             "all/{1}/{2}".format(
                                 NS.tendrl_context.integration_id,
                                 fqdn,
                                 path
                             )
                brick_full_path = etcd_utils.read(
                    "%s/brick_path" % brick_path
                ).value
                if brick_full_path not in latest_bricks:
                    bricks_to_remove.append(brick_full_path)

        brick_details = {}
        brick_details["volume"] = volume
        brick_details["bricks"] = " ".join(bricks_to_remove)
        event["message"] = brick_details
        self.volume_remove_brick_force(event)
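
snapshot_restored() above compares the bricks recorded in etcd for the restored volume against the bricks reported by a fresh get-state run, and hands the ones that are no longer present to volume_remove_brick_force() as a space-separated string. The comparison itself is a simple difference:

    def bricks_to_remove(stored_bricks, latest_bricks):
        """Bricks known to the central store but missing from the latest get-state."""
        latest = set(latest_bricks)
        return [brick for brick in stored_bricks if brick not in latest]

    stored = ["node-1:/bricks/b1", "node-2:/bricks/b2", "node-3:/bricks/b3"]
    latest = ["node-1:/bricks/b1", "node-3:/bricks/b3"]
    removed = bricks_to_remove(stored, latest)
    # the joined string is what gets fed back as event['message']['bricks']
    print(" ".join(removed))  # node-2:/bricks/b2
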
Example #48
    def run(self):
        node_ids = self.parameters.get('Node[]')
        if not node_ids or len(node_ids) == 0:
            raise AtomExecutionFailedError("Node[] cannot be empty")

        for node_id in node_ids:
            # Check if node has the OS details populated
            try:
                os_details = etcd_utils.read("nodes/%s/Os" % node_id)
                if os_details.leaves is None:
                    logger.log(
                        "error",
                        NS.get("publisher_id", None),
                        {
                            "message": "Node %s doesn't have OS details "
                                       "populated" % NS.node_context.fqdn
                        },
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id']
                    )
                    return False
            except etcd.EtcdKeyNotFound:
                logger.log(
                    "error",
                    NS.get("publisher_id", None),
                    {
                        "message": "Node %s doesn't have OS details "
                                   "populated" %
                                   node_id
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False

            # Check if node has the CPU details populated
            try:
                cpu_details = etcd_utils.read("nodes/%s/Cpu" % node_id)
                if cpu_details.leaves is None:
                    logger.log(
                        "error",
                        NS.get("publisher_id", None),
                        {
                            "message": "Node %s doesn't have CPU details "
                                       "populated" % NS.node_context.fqdn
                        },
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id']
                    )
                    return False
            except etcd.EtcdKeyNotFound:
                logger.log(
                    "error",
                    NS.get("publisher_id", None),
                    {
                        "message": "Node %s doesn't have CPU details "
                                   "populated" % NS.node_context.fqdn
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False

            # Check if node has the Memory populated
            try:
                memory_details = etcd_utils.read(
                    "nodes/%s/Memory" % node_id
                )
                if memory_details.leaves is None:
                    logger.log(
                        "error",
                        NS.get("publisher_id", None),
                        {
                            "message": "Node %s doesn't have Memory details "
                                       "populated" % NS.node_context.fqdn
                        },
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id']
                    )
                    return False

            except etcd.EtcdKeyNotFound:
                logger.log(
                    "error",
                    NS.get("publisher_id", None),
                    {
                        "message": "Node %s doesn't have Memory details "
                                   "populated" % NS.node_context.fqdn
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False

            # Check if node has networks details populated
            try:
                networks = etcd_utils.read("nodes/%s/Networks" % node_id)
                if networks.leaves is None:
                    logger.log(
                        "error",
                        NS.get("publisher_id", None),
                        {
                            "message": "Node %s doesn't have network details "
                                       "populated" % NS.node_context.fqdn
                        },
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id']
                    )
                    return False
            except etcd.EtcdKeyNotFound:
                logger.log(
                    "error",
                    NS.get("publisher_id", None),
                    {
                        "message": "Node %s doesn't have network details "
                                   "populated" % NS.node_context.fqdn
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False

        return True
Exemplo n.º 49
0
    def run(self):
        integration_id = self.parameters['TendrlContext.integration_id']
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=integration_id
        ).load()

        try:
            # Get the cluster nodes
            nodes = etcd_utils.read("/clusters/%s/nodes" % integration_id)
            child_job_ids = []
            node_ids = []
            for node in nodes.leaves:
                node_id = node.key.split("/")[-1]
                node_ids.append(node_id)
                # Create jobs on the nodes for stopping services
                _job_id = str(uuid.uuid4())
                params = {
                    "Services[]": ["tendrl-gluster-integration"]
                }
                payload = {
                    "tags": ["tendrl/node_%s" % node_id],
                    "run": "tendrl.objects.Node.flows.StopServices",
                    "status": "new",
                    "parameters": params,
                    "parent": self.parameters["job_id"],
                    "type": "node"
                }
                NS.tendrl.objects.Job(
                    job_id=_job_id,
                    status="new",
                    payload=payload
                ).save()
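                # saving the job publishes it under /queue, from where the
                # node agent on the tagged node picks it up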
                child_job_ids.append(_job_id)
                logger.log(
                    "info",
                    NS.publisher_id,
                    {
                        "message": "Stop tendrl services (job: %s) "
                        "on %s in cluster %s" %
                        (_job_id, node_id, _cluster.short_name)
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                )

            # Wait up to (number of nodes) * 10 seconds for the
            # stop-service jobs to complete
            loop_count = 0
            wait_count = len(child_job_ids) * 2
            while True:
                child_jobs_failed = []
                if loop_count >= wait_count:
                    logger.log(
                        "error",
                        NS.publisher_id,
                        {
                            "message": "Stop service jobs on cluster(%s) not "
                            "yet complete on all nodes(%s). Timing out. "
                            % (_cluster.short_name, str(node_ids))
                        },
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id'],
                    )
                    # Marking child jobs as failed which did not complete as
                    # the parent job has timed out. This has to be done
                    # explicitly because these jobs will still be processed
                    # by the node-agent, and will keep it busy, which might
                    # defer the new jobs or lead to their timeout.
                    for child_job_id in child_job_ids:
                        child_job = NS.tendrl.objects.Job(
                            job_id=child_job_id
                        ).load()
                        if child_job.status not in ["finished", "failed"]:
                            child_job.status = "failed"
                            child_job.save()
                    return False
                time.sleep(5)
                finished = True
                for child_job_id in child_job_ids:
                    child_job = NS.tendrl.objects.Job(
                        job_id=child_job_id
                    ).load()
                    if child_job.status not in ["finished", "failed"]:
                        finished = False
                    elif child_job.status == "failed":
                        child_jobs_failed.append(child_job.job_id)
                if finished:
                    break
                else:
                    loop_count += 1
                    continue
            if len(child_jobs_failed) > 0:
                _msg = "Child jobs failed are %s" % child_jobs_failed
                logger.log(
                    "error",
                    NS.publisher_id,
                    {"message": _msg},
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False
        except etcd.EtcdKeyNotFound:
            pass

        return True
Exemplo n.º 50
0
    def run(self):
        try:
            all_node_status_up = True
            # check whether this job is a parent or a child job
            job = NS.tendrl.objects.Job(
                job_id=self.parameters['job_id']
            ).load()
            if "parent" not in job.payload:
                # fetch the node ids using the integration_id
                integration_id = self.parameters[
                    'TendrlContext.integration_id'
                ]
                key = "indexes/tags/tendrl/integration/%s" % \
                    integration_id
                node_ids_str = etcd_utils.read(key).value
                node_ids = json.loads(node_ids_str)
                # check the status of each node by its node_id
                logger.log(
                    "info",
                    NS.publisher_id,
                    {"message": "Checking status of nodes %s" % str(node_ids)},
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                nodes_up = []
                nodes_down = []
                for node in node_ids:
                    node = str(node)
                    # a missing node_context defaults the status to DOWN
                    node_context = NS.tendrl.objects.NodeContext(
                        node_id=node,
                        status='DOWN'
                    ).load()
                    if node_context.status == "UP":
                        nodes_up.append(node)
                    else:
                        all_node_status_up = False
                        nodes_down.append(node)
                if all_node_status_up:
                    logger.log(
                        "info",
                        NS.publisher_id,
                        {"message": "Nodes %s are up" % nodes_up},
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id']
                    )
                else:
                    logger.log(
                        "info",
                        NS.publisher_id,
                        {"message": "Nodes %s are down" %
                         nodes_down},
                        job_id=self.parameters['job_id'],
                        flow_id=self.parameters['flow_id']
                    )
            # child jobs need no separate check
            return all_node_status_up
        except (etcd.EtcdKeyNotFound, TypeError) as ex:
            logger.log(
                "error",
                NS.get("publisher_id", None),
                {
                    "message": "Error checking status of nodes. Error: %s"
                               % str(ex)
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            return False
Exemplo n.º 51
0
    def volume_delete(self, event):
        time.sleep(self.sync_interval)
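        # give the periodic sync one interval to settle before the
        # volume and brick details are read back from etcd below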
        fetched_volumes = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id
        ).load_all()
        for fetched_volume in fetched_volumes:
            if fetched_volume.name == event['message']['name']:
                fetched_volume.deleted = True
                fetched_volume.deleted_at = time_utils.now()
                fetched_volume.save()
                try:
                    sub_volumes = etcd_utils.read(
                        "/clusters/{0}/Volumes/{1}/Bricks".format(
                            NS.tendrl_context.integration_id,
                            fetched_volume.vol_id
                        )
                    )

                    for sub_volume in sub_volumes.leaves:
                        bricks = etcd_utils.read(
                            sub_volume.key
                        )
                        for brick in bricks.leaves:
                            fqdn = brick.key.split('/')[-1].split(':')[0]
                            path = brick.key.split('/')[-1].split(':')[-1][1:]
                            # Load the brick object to get its full path
                            brick_obj = NS.tendrl.objects.GlusterBrick(
                                NS.tendrl_context.integration_id,
                                fqdn,
                                path
                            ).load()
                            # Delete the brick key from etcd
                            brick_path = "clusters/{0}/Bricks/"\
                                         "all/{1}/{2}".format(
                                             NS.tendrl_context.integration_id,
                                             fqdn,
                                             path
                                         )
                            etcd_utils.delete(
                                brick_path,
                                recursive=True
                            )
                            brick_full_path = fqdn + ":" + brick_obj.\
                                brick_path.split(":")[-1]
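                            # Delete the brick's dashboard from grafana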
                            job_id = monitoring_utils.update_dashboard(
                                "%s|%s" % (
                                    event['message']['name'],
                                    brick_full_path
                                ),
                                RESOURCE_TYPE_BRICK,
                                NS.tendrl_context.integration_id,
                                "delete"
                            )
                            logger.log(
                                "debug",
                                NS.publisher_id,
                                {
                                    "message": "Update dashboard job %s"
                                    " for brick %s "
                                    "in cluster %s created" % (
                                        job_id,
                                        brick.key.split('/')[-1],
                                        NS.tendrl_context.integration_id
                                    )
                                }
                            )
                            # Delete brick from graphite
                            job_id = monitoring_utils.\
                                delete_resource_from_graphite(
                                    "%s|%s" % (
                                        event['message']['name'],
                                        brick_full_path
                                    ),
                                    RESOURCE_TYPE_BRICK,
                                    NS.tendrl_context.integration_id,
                                    "delete"
                                )
                            logger.log(
                                "debug",
                                NS.publisher_id,
                                {
                                    "message": "Delete resource "
                                    "from graphite job %s "
                                    "for brick %s in cluster %s created" % (
                                        job_id,
                                        brick.key.split('/')[-1],
                                        NS.tendrl_context.integration_id
                                    )
                                }
                            )
                except etcd.EtcdKeyNotFound:
                    pass
        # Delete volume dashboard from grafana
        job_id = monitoring_utils.update_dashboard(
            event['message']['name'],
            RESOURCE_TYPE_VOLUME,
            NS.tendrl_context.integration_id,
            "delete"
        )
        logger.log(
            "debug",
            NS.publisher_id,
            {
                "message": "Update dashboard job %s "
                "created" % job_id
            }
        )
        # Delete volume details from graphite
        job_id = monitoring_utils.delete_resource_from_graphite(
            event['message']['name'],
            RESOURCE_TYPE_VOLUME,
            NS.tendrl_context.integration_id,
            "delete"
        )
        logger.log(
            "debug",
            NS.publisher_id,
            {
                "message": "Delete resource from graphite job %s "
                "created" % job_id
            }
        )
Exemplo n.º 52
0
    def on_change(self, attr, prev_value, current_value):
        if attr == "status" and "tendrl/monitor" in NS.node_context.tags:
            _tc = NS.tendrl.objects.TendrlContext(
                node_id=self.node_id
            ).load()
            # Check node is managed
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=self.node_id,
                integration_id=_tc.integration_id
            ).load()
            if current_value is None and str(
                    _cnc.is_managed).lower() == "yes":
                self.status = "DOWN"
                self.save()
                msg = "Node {0} is DOWN".format(self.fqdn)
                event_utils.emit_event(
                    "node_status",
                    self.status,
                    msg,
                    "node_{0}".format(self.fqdn),
                    "WARNING",
                    node_id=self.node_id,
                    integration_id=_tc.integration_id
                )
                # Re-save the ClusterNodeContext so it is refreshed
                # with the latest node_context values
                _cnc_new = \
                    NS.tendrl.objects.ClusterNodeContext(
                        node_id=self.node_id,
                        integration_id=_tc.integration_id,
                        first_sync_done=_cnc.first_sync_done,
                        is_managed=_cnc.is_managed
                    )
                _cnc_new.save()
                del _cnc_new
                # Update cluster details
                self.update_cluster_details(_tc.integration_id)
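                # the node is down, so drop its provisioner tag and the
                # corresponding index entry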
                _tag = "provisioner/%s" % _tc.integration_id
                if _tag in self.tags:
                    _index_key = "/indexes/tags/%s" % _tag
                    self.tags.remove(_tag)
                    self.save()
                    etcd_utils.delete(_index_key)
                if _tc.sds_name in ["gluster", "RHGS"]:
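                    # mark every brick hosted on this node as Stopped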
                    bricks = etcd_utils.read(
                        "clusters/{0}/Bricks/all/{1}".format(
                            _tc.integration_id,
                            self.fqdn
                        )
                    )

                    for brick in bricks.leaves:
                        try:
                            etcd_utils.write(
                                "{0}/status".format(brick.key),
                                "Stopped"
                            )
                        except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                            pass
            elif current_value == "UP" and str(
                    _cnc.is_managed).lower() == "yes":
                msg = "{0} is UP".format(self.fqdn)
                event_utils.emit_event(
                    "node_status",
                    "UP",
                    msg,
                    "node_{0}".format(self.fqdn),
                    "INFO",
                    node_id=self.node_id,
                    integration_id=_tc.integration_id
                )
            del _cnc
Exemplo n.º 53
0
def _derive_volume_states(volumes):
    out_dict = {}
    for volume in volumes:
        if volume.status == "Stopped":
            out_dict[volume.vol_id] = "down"
        else:
            subvol_count = 0
            bricks = []
            subvol_states = []
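            # walk the subvolumes recorded in etcd and count the bricks
            # that are not in Started state in each of them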
            while True:
                try:
                    subvol = etcd_utils.read(
                        "clusters/%s/Volumes/%s/Bricks/subvolume%s" % (
                            NS.tendrl_context.integration_id,
                            volume.vol_id,
                            subvol_count
                        )
                    )
                    state = 0
                    for entry in subvol.leaves:
                        brick_name = entry.key.split("/")[-1]
                        fetched_brick = NS.tendrl.objects.GlusterBrick(
                            NS.tendrl_context.integration_id,
                            brick_name.split(":")[0],
                            brick_name.split(":_")[-1]
                        ).load()
                        if not fetched_brick.status:
                            fetched_brick.status = "Stopped"
                        bricks.append(fetched_brick)
                        if fetched_brick.status != "Started":
                            state += 1
                    subvol_states.append(state)
                    subvol_count += 1
                except etcd.EtcdKeyNotFound:
                    break

            total_bricks = len(bricks)
            up_bricks = 0
            for brick in bricks:
                if brick.status == "Started":
                    up_bricks += 1
            if total_bricks == 0 or total_bricks < int(volume.brick_count):
                # Brick details for this volume are not (fully) synced yet
                out_dict[volume.vol_id] = 'unknown'
            elif up_bricks == 0:
                out_dict[volume.vol_id] = 'down'
            else:
                out_dict[volume.vol_id] = 'up'
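                # for replicated/dispersed volumes, check whether the worst
                # subvolume has lost enough bricks to degrade the volume or
                # make part of it unavailable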
                if int(volume.replica_count) > 1 or \
                    int(volume.disperse_count) > 0:
                    worst_subvol = max(subvol_states)
                    if worst_subvol > 0:
                        subvol_prob = max(
                            int(volume.replica_count),
                            int(volume.redundancy_count) + 1
                        )
                        if worst_subvol == subvol_prob:
                            # the worst subvolume has lost as many bricks
                            # as its protection allows (all replicas, or
                            # redundancy + 1); with a single subvolume the
                            # whole volume is down, otherwise it is partial
                            if subvol_count == 1:
                                out_dict[volume.vol_id] = 'down'
                            else:
                                out_dict[volume.vol_id] = '(partial)'
                        else:
                            out_dict[volume.vol_id] = '(degraded)'
                else:
                    # This volume is not 'protected', so any brick
                    # disruption leads straight to a 'partial'
                    # availability state
                    if up_bricks != total_bricks:
                        out_dict[volume.vol_id] = '(partial)'
        # Raise the alert if volume state changes
        if volume.state != "" and \
            out_dict[volume.vol_id] not in [volume.state, 'unknown']:
            msg = "Volume:%s is %s" % (volume.name, out_dict[volume.vol_id])
            instance = "volume_%s" % volume.name
            event_utils.emit_event(
                "volume_state",
                out_dict[volume.vol_id],
                msg,
                instance,
                'INFO' if out_dict[volume.vol_id] == 'up' else 'WARNING',
                tags={"entity_type": RESOURCE_TYPE_VOLUME,
                      "volume_name": volume.name
                      }
            )
        # Persist the derived volume state
        volume.state = out_dict[volume.vol_id]
        volume.save()

    return out_dict