def run(self):
    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "%s running" % self.__class__.__name__}
    )
    while not self._complete.is_set():
        _job_sync_interval = 5
        NS.node_context = NS.node_context.load()
        if "tendrl/monitor" in NS.node_context.tags:
            _job_sync_interval = 3
        time.sleep(_job_sync_interval)
        try:
            jobs = etcd_utils.read("/queue")
        except etcd.EtcdKeyNotFound:
            continue
        for job in jobs.leaves:
            # Check job not already locked by some agent
            jid = job.key.split('/')[-1]
            job_lock_key = "/queue/%s/locked_by" % jid
            try:
                _locked_by = etcd_utils.read(job_lock_key).value
                if _locked_by:
                    continue
            except etcd.EtcdKeyNotFound:
                pass
            _job_thread = threading.Thread(
                target=process_job,
                args=(jid,)  # note the trailing comma: args must be a tuple
            )
            _job_thread.daemon = True
            _job_thread.start()
            # join() makes job processing sequential within this loop
            _job_thread.join()
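# Illustrative sketch (not from the original source): the worker above
# assumes a module-level process_job callable that takes a single job id.
# The stub below only demonstrates that assumed contract; its name and the
# lock-write step are hypothetical.
def process_job_sketch(jid):
    # A real implementation would claim the job before running it, e.g.:
    # etcd_utils.write("/queue/%s/locked_by" % jid, NS.node_context.node_id)
    job = etcd_utils.read("/queue/%s" % jid)  # key layout taken from run()
    return job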
def get_volumes_details(cluster_key):
    volume_details = []
    volume_id = None
    try:
        volume_list = utils.get_resource_keys(cluster_key, "Volumes")
        for volume_id in volume_list:
            deleted = etcd_utils.read(
                cluster_key + "/Volumes/" + str(volume_id) + "/deleted"
            ).value
            if str(deleted).lower() != "true":
                volume_data = {}
                for attr in ATTRS["volumes"]:
                    volume_data[attr] = etcd_utils.read(
                        cluster_key + "/Volumes/" + str(volume_id) +
                        "/" + attr
                    ).value
                subvolume_key = cluster_key + "/Volumes/" + str(volume_id)
                subvolume_details = get_subvolume_details(subvolume_key)
                volume_data["subvolume"] = subvolume_details
                volume_details.append(volume_data)
    except (KeyError, etcd.EtcdKeyNotFound) as ex:
        logger.log(
            "debug",
            NS.get("publisher_id", None),
            {
                'message': "Error while fetching "
                           "volume id {0}: {1}".format(volume_id, str(ex))
            }
        )
    return volume_details
def get_alert_destinations(self, key):
    email_ids = []
    email_notifications = etcd_utils.read(key)
    for email_notification in email_notifications.leaves:
        email = etcd_utils.read(email_notification.key).value
        email_ids.append(email)
    return email_ids
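# Assumed etcd layout for get_alert_destinations, inferred from the reads
# above (the concrete child names are illustrative only):
#   <key>/0 -> "admin@example.com"
#   <key>/1 -> "ops@example.com"
# etcd_utils.read(key).leaves yields one child node per address, and each
# child's value is read back as an email id.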
def exists(self):
    self.render()
    _exists = True
    try:
        etcd_utils.read("/{0}".format(self.value))
    except etcd.EtcdKeyNotFound:
        _exists = False
    return _exists
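# Usage as seen elsewhere in this section (the gluster sync thread below):
# exists() probes the rendered key without deserializing the object, so a
# cheap existence check can gate the more expensive load():
#   _cluster = NS.tendrl.objects.Cluster(
#       integration_id=NS.tendrl_context.integration_id).load()
#   if _cluster.exists():
#       _cluster = _cluster.load()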
def run(self):
    node_ids = self.parameters.get('Node[]')
    if not node_ids:
        raise AtomExecutionFailedError("Node[] cannot be empty")
    # Each node must have its hardware inventory synced before we proceed;
    # the same check is repeated for every inventory sub-tree
    checks = [
        ("Os", "OS"),
        ("Cpu", "CPU"),
        ("Memory", "Memory"),
        ("Networks", "network"),
    ]
    for node_id in node_ids:
        for attr, label in checks:
            try:
                details = etcd_utils.read("nodes/%s/%s" % (node_id, attr))
                if details.leaves is None:
                    raise AtomExecutionFailedError(
                        "Node doesn't have %s details populated" % label
                    )
            except etcd.EtcdKeyNotFound:
                raise AtomExecutionFailedError(
                    "Node doesn't have %s details populated" % label
                )
    return True
def run(self):
    integration_id = self.parameters['TendrlContext.integration_id']
    etcd_keys_to_delete = [
        "/clusters/%s/nodes" % integration_id,
        "/clusters/%s/Bricks" % integration_id,
        "/clusters/%s/Volumes" % integration_id,
        "/clusters/%s/GlobalDetails" % integration_id,
        "/clusters/%s/TendrlContext" % integration_id,
        "/clusters/%s/Utilization" % integration_id,
        "/clusters/%s/raw_map" % integration_id,
        "/alerting/clusters/%s" % integration_id,
    ]
    nodes = etcd_utils.read("/clusters/%s/nodes" % integration_id)
    node_ids = []
    for node in nodes.leaves:
        node_id = node.key.split("/")[-1]
        node_ids.append(node_id)
        etcd_keys_to_delete.append("/alerting/nodes/%s" % node_id)
    # Find the alerting/alerts entries to be deleted
    try:
        cluster_alert_ids = etcd_utils.read("/alerting/clusters")
        for entry in cluster_alert_ids.leaves:
            ca_id = entry.key.split("/")[-1]
            etcd_keys_to_delete.append("/alerting/alerts/%s" % ca_id)
    except etcd.EtcdKeyNotFound:
        # No cluster alerts, continue
        pass
    try:
        node_alert_ids = etcd_utils.read("/alerting/nodes")
        for entry in node_alert_ids.leaves:
            na_id = entry.key.split("/")[-1]
            etcd_keys_to_delete.append("/alerting/alerts/%s" % na_id)
    except etcd.EtcdKeyNotFound:
        # No node alerts, continue
        pass
    # Remove the cluster details
    for key in list(set(etcd_keys_to_delete)):
        try:
            etcd_utils.delete(key, recursive=True)
        except etcd.EtcdKeyNotFound:
            logger.log(
                "debug",
                NS.publisher_id,
                {"message": "%s key not found for deletion" % key},
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
            )
            continue
    return True
def shutdown(signum, frame):
    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "Signal handler: stopping"}
    )
    # Remove the node's name from the gluster server tag
    try:
        gl_srvr_list = etcd_utils.read(
            "/indexes/tags/gluster/server"
        ).value
        gl_srvr_list = json.loads(gl_srvr_list)
        if NS.node_context.node_id in gl_srvr_list:
            gl_srvr_list.remove(NS.node_context.node_id)
        etcd_utils.write(
            "/indexes/tags/gluster/server",
            json.dumps(gl_srvr_list)
        )
        node_tags = json.loads(NS.node_context.tags)
        if 'provisioner/%s' % NS.tendrl_context.integration_id \
                in node_tags:
            etcd_utils.delete(
                "/indexes/tags/provisioner/%s" %
                NS.tendrl_context.integration_id,
                recursive=True
            )
        int_srvr_list = etcd_utils.read(
            "/indexes/tags/tendrl/integration/gluster"
        ).value
        int_srvr_list = json.loads(int_srvr_list)
        if NS.node_context.node_id in int_srvr_list:
            int_srvr_list.remove(NS.node_context.node_id)
        etcd_utils.write(
            "/indexes/tags/tendrl/integration/gluster",
            json.dumps(int_srvr_list)
        )
    except etcd.EtcdKeyNotFound:
        logger.log(
            "debug",
            NS.publisher_id,
            {
                "message": "Couldn't remove node from "
                           "gluster servers list tag. "
                           "integration_id: %s, node_id: %s" % (
                               NS.tendrl_context.integration_id,
                               NS.node_context.node_id
                           )
            }
        )
    complete.set()
    m.stop()
def shutdown(signum, frame):
    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "Signal handler: stopping"}
    )
    # Remove the node's name from the gluster server tag
    try:
        gl_srvr_list = etcd_utils.read(
            "/indexes/tags/gluster/server"
        ).value
        gl_srvr_list = json.loads(gl_srvr_list)
        if NS.node_context.node_id in gl_srvr_list:
            gl_srvr_list.remove(NS.node_context.node_id)
        etcd_utils.write(
            "/indexes/tags/gluster/server",
            json.dumps(gl_srvr_list)
        )
        node_tags = NS.node_context.tags
        if 'provisioner/%s' % NS.tendrl_context.integration_id \
                in node_tags:
            etcd_utils.delete(
                "/indexes/tags/provisioner/%s" %
                NS.tendrl_context.integration_id,
                recursive=True
            )
        int_srvr_list = etcd_utils.read(
            "/indexes/tags/tendrl/integration/gluster"
        ).value
        int_srvr_list = json.loads(int_srvr_list)
        if NS.node_context.node_id in int_srvr_list:
            int_srvr_list.remove(NS.node_context.node_id)
        etcd_utils.write(
            "/indexes/tags/tendrl/integration/gluster",
            json.dumps(int_srvr_list)
        )
    except etcd.EtcdKeyNotFound:
        logger.log(
            "debug",
            NS.publisher_id,
            {
                "message": "Couldn't remove node from "
                           "gluster servers list tag. "
                           "integration_id: %s, node_id: %s" % (
                               NS.tendrl_context.integration_id,
                               NS.node_context.node_id
                           )
            }
        )
    complete.set()
    m.stop()
def get_volume_details(self, objects, cluster_key):
    volume_detail = []
    volume_list = utils.get_resource_keys(cluster_key, "Volumes")
    for volume in volume_list:
        resource_detail = {}
        volume_key = os.path.join(cluster_key, "Volumes", volume)
        volume_deleted_key = os.path.join(volume_key, "deleted")
        try:
            is_volume_deleted = etcd_utils.read(volume_deleted_key).value
            if is_volume_deleted.lower() == "true":
                continue
        except etcd.EtcdKeyNotFound:
            continue
        for key, value in objects["Volume"]["attrs"].items():
            if value is None:
                try:
                    attr_key = os.path.join(volume_key, key)
                    attr_data = etcd_utils.read(attr_key)
                    attr_value = self.resource_status_mapper(
                        str(attr_data.value))
                    resource_detail[key] = attr_value
                except (KeyError, etcd.EtcdKeyNotFound) as ex:
                    logger.log(
                        "debug",
                        NS.get("publisher_id", None),
                        {
                            'message': "Cannot find {0} in volume "
                                       "{1}: {2}".format(
                                           key, volume, str(ex))
                        }
                    )
            else:
                try:
                    new_key = os.path.join(
                        volume_key,
                        objects["Volume"]["attrs"][key]["value"].rsplit(
                            "/", 1)[1]
                    )
                    resp_data = self.get_object_from_central_store(
                        new_key, objects["Volume"]["attrs"][key])
                    resource_detail[key] = resp_data
                except (etcd.EtcdKeyNotFound, AttributeError,
                        KeyError) as ex:
                    # Default counters when the sub-resource is missing
                    resource_detail[key] = {
                        "total": 0,
                        "up": 0,
                        "down": 0,
                        "partial": 0,
                        "created": 0,
                        "stopped": 0,
                        "paused": 0
                    }
        if not resource_detail == {}:
            volume_detail.append(resource_detail)
    return volume_detail
def find_volume_id(vol_name, integration_id):
    try:
        volumes = etcd_utils.read("clusters/%s/Volumes" % integration_id)
        for volume in volumes.leaves:
            key = volume.key + "/name"
            name = etcd_utils.read(key).value
            if vol_name == name:
                return volume.key.split("/")[-1]
    except EtcdKeyNotFound as ex:
        logger.log(
            "error",
            NS.publisher_id,
            {
                "message": "Failed to fetch volume id for "
                           "volume name %s" % vol_name
            }
        )
        raise ex
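# Illustrative call (the volume name and integration id are placeholders):
#   vol_id = find_volume_id("gv0", "77deef29-b8e5-4dc5-8247-21e2a409a66a")
# The helper reads clusters/<integration_id>/Volumes/<vol_id>/name for each
# volume and returns the matching <vol_id> path component, or None when no
# volume carries that name.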
def run():
    try:
        clusters = etcd_utils.read("/clusters")
    except etcd.EtcdKeyNotFound:
        return
    # This logic only runs on the node tagged `tendrl/monitor` (the tendrl
    # server): it walks all clusters and tries to mark a managed cluster's
    # status as `unhealthy` when the status key is missing (expired TTL)
    for cluster in clusters.leaves:
        int_id = cluster.key.split('/')[-1]
        fetched_cluster = NS.tendrl.objects.Cluster(
            integration_id=int_id
        ).load()
        try:
            if fetched_cluster and fetched_cluster.is_managed == "yes":
                NS._int.client.write(
                    "/clusters/{0}/GlobalDetails/status".format(int_id),
                    "unhealthy",
                    prevExist=False
                )
        except etcd.EtcdAlreadyExist:
            pass
    return
def run(self):
    integration_id = self.parameters['TendrlContext.integration_id']
    # Wait for /indexes/tags/tendrl/integration/$integration_id
    # to appear. This means the cluster is import ready
    wait_count = 6
    loop_count = 0
    while True:
        # Check the timeout first so it also fires when the key
        # is still missing after every retry
        if loop_count >= wait_count:
            raise AtomExecutionFailedError(
                "Cluster: %s is not yet marked as "
                "import ready. Timing out." % integration_id
            )
        try:
            integration_id_index_key = \
                "indexes/tags/tendrl/integration/%s" % integration_id
            _node_ids = etcd_utils.read(
                integration_id_index_key
            ).value
            if _node_ids:
                return True
        except etcd.EtcdKeyNotFound:
            pass
        # Index not present (or empty) yet, retry after a short wait
        time.sleep(5)
        loop_count += 1
def sync_volume_connections(volumes):
    for volume in volumes:
        subvol_count = 0
        vol_connections = 0
        while True:
            try:
                subvol = etcd_utils.read(
                    "clusters/%s/Volumes/%s/Bricks/subvolume%s" % (
                        NS.tendrl_context.integration_id,
                        volume.vol_id,
                        subvol_count
                    )
                )
                if subvol:
                    for entry in subvol.leaves:
                        brick_name = entry.key.split("/")[-1]
                        fetched_brick = NS.tendrl.objects.GlusterBrick(
                            NS.tendrl_context.integration_id,
                            brick_name.split(":")[0],
                            brick_name.split(":_")[-1]
                        ).load()
                        if fetched_brick and fetched_brick.client_count:
                            vol_connections += 0 \
                                if fetched_brick.client_count == '' \
                                else int(fetched_brick.client_count)
                subvol_count += 1
            except etcd.EtcdKeyNotFound:
                break
        volume.client_count = vol_connections
        volume.save()
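# Assumed etcd layout walked by sync_volume_connections, matching the keys
# written by sync_volumes later in this section:
#   clusters/<integration_id>/Volumes/<vol_id>/Bricks/subvolume<N>/<fqdn>:_<brick_dir>
# Subvolume directories are probed with an increasing <N> until
# EtcdKeyNotFound marks the end of the list.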
def refresh_dashboard(self):
    try:
        # Proceed only if the alert organization exists
        if NS.config.data["org_id"]:
            cluster_details = {}
            dashboards = []
            integration_ids = utils.get_resource_keys("", "clusters")
            for integration_id in integration_ids:
                key = "/clusters/%s/TendrlContext/sds_name" % \
                    integration_id
                sds_name = etcd_utils.read(key).value
                if sds_name == constants.GLUSTER:
                    cluster_details, dashboards = \
                        gluster_cluster_details.get_cluster_details(
                            integration_id
                        )
                    cluster_details["sds_name"] = constants.GLUSTER
                self.update_dashboard(cluster_details, dashboards)
        else:
            # try to create the alert organization once again
            alert_organization.create()
    except (etcd.EtcdException,
            KeyError,
            AttributeError,
            req_excep.ConnectionError,
            TypeError,
            req_excep.RequestException,
            exceptions.ConnectionFailedException,
            exceptions.AlertOrganizationNotFound) as ex:
        logger.log(
            "debug",
            NS.get("publisher_id", None),
            {
                'message': "Failed to update cluster "
                           "dashboard. Error: %s" % str(ex)
            }
        )
def _run(self):
    aggregate_gluster_objects = NS.monitoring.definitions.get_parsed_defs()[
        "namespace.monitoring"]["graphite_data"]
    while not self._complete.is_set():
        if self.sync_interval is None:
            try:
                interval = etcd_utils.read(
                    "_NS/gluster/config/data/sync_interval")
                try:
                    self.sync_interval = float(interval.value)
                except ValueError as ex:
                    logger.log(
                        "error",
                        NS.get("publisher_id", None),
                        {
                            'message': "Unable to parse "
                                       "tendrl-gluster-integration config "
                                       "'sync_interval' (value: %s)" %
                                       interval.value
                        }
                    )
                    raise ex
            except etcd.EtcdKeyNotFound:
                # sync_interval is not populated before cluster import
                continue
        try:
            gevent.sleep(self.sync_interval)
            cluster_details = self.plugin_obj.get_central_store_data(
                aggregate_gluster_objects)
            metrics = graphite_utils.create_metrics(
                aggregate_gluster_objects, cluster_details)
            for metric in metrics:
                for key, value in metric.items():
                    if value:
                        response = self.plugin_obj.push_metrics(key, value)
        except (etcd.EtcdKeyNotFound, AttributeError, KeyError) as ex:
            logger.log("error",
                       NS.get("publisher_id", None),
                       {'message': str(ex)})
def load(self):
    self.render()
    if "Message" not in self.__class__.__name__:
        # If local object.hash is equal to
        # central_store object.hash, return
        if self.hash_compare_with_central_store():
            return self
    _copy = self._copy_vars()
    # Check if self.value already set, use it
    if self.value.find('{') < 0:
        _copy.value = self.value
    key = _copy.value + '/data'
    try:
        val_str = etcd_utils.read(key).value
    except etcd.EtcdKeyNotFound:
        return self
    loc_dict = json.loads(val_str)
    for attr_name, attr_val in vars(_copy).iteritems():
        _type = self._defs.get("attrs", {}).get(attr_name,
                                                {}).get("type")
        if loc_dict.get(attr_name) in [None, ""]:
            if _type and _type.lower() == 'list':
                setattr(_copy, attr_name, list())
            if _type and _type.lower() == 'json':
                setattr(_copy, attr_name, dict())
        else:
            if _type and _type.lower() in ['list']:
                setattr(_copy, attr_name,
                        json.loads(loc_dict[attr_name]))
            else:
                setattr(_copy, attr_name, loc_dict[attr_name])
    return _copy
def get_node_details(self, objects, integration_id):
    node_detail = []
    _cluster_node_ids = etcd_utils.read("/clusters/%s/nodes" %
                                        integration_id)
    for _node_id in _cluster_node_ids.leaves:
        _cnc = NS.tendrl.objects.ClusterNodeContext(
            integration_id=integration_id,
            node_id=_node_id.key.split('/')[-1]
        ).load()
        if _cnc.is_managed != "yes":
            continue
        resource_detail = {}
        for key, value in objects["Node"]["attrs"].items():
            if value is None:
                attr_value = getattr(_cnc, key)
                if attr_value not in [None, ""]:
                    attr_value = self.resource_status_mapper(
                        str(getattr(_cnc, key)))
                    resource_detail[key] = attr_value
                else:
                    if key == 'status':
                        # Fall back to NodeContext when the cluster level
                        # context carries no status
                        _node_context = NS.tendrl.objects.NodeContext(
                            node_id=_cnc.node_id).load()
                        attr_value = self.resource_status_mapper(
                            str(getattr(_node_context, 'status')))
                        resource_detail[key] = attr_value
        node_detail.append(resource_detail)
    return node_detail
def aggregate_session_status():
    volumes = NS.tendrl.objects.GlusterVolume(
        NS.tendrl_context.integration_id
    ).load_all()
    georep_status = GeoReplicationSessionStatus()
    if volumes:
        for volume in volumes:
            vol_id = volume.vol_id
            sessions = None
            try:
                sessions = etcd_utils.read(
                    "clusters/%s/Volumes/%s/GeoRepSessions" % (
                        NS.tendrl_context.integration_id,
                        vol_id
                    )
                )
            except etcd.EtcdKeyNotFound:
                continue
            pair_count = int(volume.brick_count)
            for session in sessions.leaves:
                session_status = None
                session_id = session.key.split("GeoRepSessions/")[-1]
                pairs = NS.gluster.objects.GeoReplicationPair(
                    vol_id=vol_id,
                    session_id=session_id
                ).load_all()
                faulty_count = 0
                stopped_count = 0
                paused_count = 0
                created_count = 0
                for pair in pairs:
                    if pair.status.lower() == "faulty":
                        faulty_count += 1
                    elif pair.status.lower() == "created":
                        created_count += 1
                    elif pair.status.lower() == "stopped":
                        stopped_count += 1
                    elif pair.status.lower() == "paused":
                        paused_count += 1
                if created_count == pair_count:
                    session_status = georep_status.CREATED
                elif (faulty_count == 0 and stopped_count == 0 and
                        paused_count == 0 and created_count == 0):
                    session_status = georep_status.UP
                elif pair_count == faulty_count:
                    session_status = georep_status.DOWN
                elif stopped_count == pair_count:
                    session_status = georep_status.STOPPED
                elif paused_count == pair_count:
                    session_status = georep_status.PAUSED
                else:
                    session_status = georep_status.PARTIAL
                NS.tendrl.objects.GeoReplicationSession(
                    vol_id=vol_id,
                    session_id=session_id,
                    session_status=session_status
                ).save()
def get_cluster_details(self, objects, cluster_key):
    cluster_detail = []
    for obj in objects["Cluster"]:
        if obj in ["metric", "value"]:
            continue
        resource_detail = {}
        resource_detail[str(obj)] = {}
        obj_details = objects["Cluster"][str(obj)]
        obj_key = os.path.join(cluster_key, str(obj))
        obj_attrs = obj_details["attrs"]
        for key, _ in obj_attrs.items():
            try:
                attr_key = os.path.join(obj_key, key)
                attr_data = etcd_utils.read(attr_key)
                attr_value = self.cluster_status_mapper(
                    str(attr_data.value))
                resource_detail[str(obj)][key] = copy.deepcopy(attr_value)
            except (KeyError, etcd.EtcdKeyNotFound) as ex:
                integration_id = cluster_key.split("/")[-1]
                logger.log(
                    "debug",
                    NS.get("publisher_id", None),
                    {
                        'message': "Cannot find {0} in cluster "
                                   "{1}: {2}".format(
                                       key, integration_id, str(ex))
                    }
                )
        if not resource_detail == {}:
            cluster_detail.append(resource_detail)
    return cluster_detail
def find_node_id(integration_id, fqdn):
    try:
        nodes = etcd_utils.read("clusters/%s/nodes" % integration_id)
        for node in nodes.leaves:
            node_id = node.key.split('/')[-1]
            node_context = NS.tendrl.objects.ClusterNodeContext()
            # Format the value here because render populates the
            # integration_id from the namespace
            node_context.value = node_context.value.format(
                integration_id, node_id)
            if fqdn == node_context.load().fqdn:
                return node_id
        raise NodeNotFound
    except (EtcdKeyNotFound, NodeNotFound) as ex:
        if type(ex) == NodeNotFound:
            logger.log(
                "error",
                NS.publisher_id,
                {
                    "message": "Node with fqdn %s not found "
                               "in cluster %s" % (fqdn, integration_id)
                }
            )
        else:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": "Failed to fetch fqdn for node %s" % fqdn}
            )
        raise ex
def on_change(self, attr, prev_value, current_value):
    if attr == "status" and "tendrl/monitor" in NS.node_context.tags:
        _tc = NS.tendrl.objects.TendrlContext(node_id=self.node_id).load()
        # Check node is managed
        _cnc = NS.tendrl.objects.ClusterNodeContext(
            node_id=self.node_id,
            integration_id=_tc.integration_id
        ).load()
        if current_value is None and str(_cnc.is_managed).lower() == "yes":
            self.status = "DOWN"
            self.save()
            msg = "Node {0} is DOWN".format(self.fqdn)
            event_utils.emit_event(
                "node_status",
                self.status,
                msg,
                "node_{0}".format(self.fqdn),
                "WARNING",
                node_id=self.node_id,
                integration_id=_tc.integration_id
            )
            # Loading ClusterNodeContext also loads NodeContext,
            # so it gets updated with the latest values
            _cnc_new = NS.tendrl.objects.ClusterNodeContext(
                node_id=self.node_id,
                integration_id=_tc.integration_id,
                first_sync_done=_cnc.first_sync_done,
                is_managed=_cnc.is_managed
            )
            _cnc_new.save()
            del _cnc_new
            # Update cluster details
            self.update_cluster_details(_tc.integration_id)
            _tag = "provisioner/%s" % _tc.integration_id
            if _tag in self.tags:
                _index_key = "/indexes/tags/%s" % _tag
                self.tags.remove(_tag)
                self.save()
                etcd_utils.delete(_index_key)
            if _tc.sds_name in ["gluster", "RHGS"]:
                bricks = etcd_utils.read(
                    "clusters/{0}/Bricks/all/{1}".format(
                        _tc.integration_id,
                        self.fqdn
                    )
                )
                for brick in bricks.leaves:
                    try:
                        etcd_utils.write(
                            "{0}/status".format(brick.key),
                            "Stopped"
                        )
                    except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                        pass
        elif current_value == "UP" and str(
                _cnc.is_managed).lower() == "yes":
            msg = "{0} is UP".format(self.fqdn)
            event_utils.emit_event(
                "node_status",
                "UP",
                msg,
                "node_{0}".format(self.fqdn),
                "INFO",
                node_id=self.node_id,
                integration_id=_tc.integration_id
            )
        del _cnc
def find_node_id(integration_id, fqdn):
    _cluster_node_ids = etcd_utils.read("/clusters/%s/nodes" %
                                        integration_id)
    for _node_id in _cluster_node_ids.leaves:
        _cnc = NS.tendrl.objects.ClusterNodeContext(
            integration_id=integration_id,
            node_id=_node_id.key.split('/')[-1]
        ).load()
        if _cnc.fqdn == fqdn:
            return _cnc.node_id
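# Note: unlike the find_node_id variant above (which raises NodeNotFound on
# a miss), this variant simply returns None when no node matches.
# Illustrative call (the fqdn is a placeholder):
#   node_id = find_node_id(integration_id, "gluster-node-1.example.com")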
def test_read():
    setattr(__builtin__, "NS", maps.NamedDict())
    setattr(NS, "_int", maps.NamedDict())
    NS._int.client = importlib.import_module(
        "tendrl.commons.tests.fixtures.client").Client()
    NS._int.reconnect = type("Dummy", (object,), {})
    with patch.object(Client, "read", return_value="test") as mock_read:
        obj = etcd_utils.read("key")
        assert obj == "test"
        # `assert mock_read.assert_called` was always true; check the
        # mock's called flag instead
        assert mock_read.called
    with patch.object(Client, "read",
                      raise_etcdconnectionfailed) as mock_read:
        with pytest.raises(etcd.EtcdConnectionFailed):
            obj = etcd_utils.read("key")
    with patch.object(Client, "read",
                      raise_etcdkeynotfound) as mock_read:
        with pytest.raises(etcd.EtcdKeyNotFound):
            obj = etcd_utils.read("key")
def get_cluster_details():
    '''To get details of gluster clusters from etcd

    TODO: Optimize the code, reduce the number of etcd calls
    TODO: Extract etcd host and port from the configuration file
    '''
    cluster_details_list = []
    try:
        result = etcd_utils.read('/clusters')
        for item in result.leaves:
            cluster_obj = cluster_detail.ClusterDetail()
            cluster_obj.integration_id = item.key.split('/')[-1]
            client_str = '/clusters/' + str(cluster_obj.integration_id)
            cluster_details = etcd_utils.read(client_str)
            for cluster in cluster_details.leaves:
                if 'Volumes' in cluster.key:
                    volumes = etcd_utils.read(client_str + "/Volumes")
                    for volume in volumes.leaves:
                        volume_id = volume.key.split('/')[-1]
                        volume_details = etcd_utils.read(
                            client_str + "/Volumes/" + str(volume_id))
                        vol_dict = maps.NamedDict()
                        for vol in volume_details.leaves:
                            if "name" in vol.key:
                                vol_dict.volume_name = vol.value
                            if "Bricks" in vol.key:
                                subvolume_details = etcd_utils.read(
                                    client_str + "/Volumes/" +
                                    str(volume_id) + "/Bricks")
                                vol_dict.bricks = []
                                for subvolume in subvolume_details.leaves:
                                    brick_details = etcd_utils.read(
                                        client_str + "/Volumes/" +
                                        str(volume_id) + "/Bricks/" +
                                        str(subvolume.key.split('/')[-1]))
                                    for brick in brick_details.leaves:
                                        vol_dict.bricks.append(
                                            brick.key.split('/')[-1])
                        cluster_obj.volumes.append(vol_dict)
                if 'nodes' in cluster.key:
                    nodes = etcd_utils.read(client_str + "/nodes")
                    for node in nodes.leaves:
                        node_id = node.key.split('/')[-1]
                        node_details = etcd_utils.read(
                            client_str + "/nodes/" + str(node_id) +
                            "/NodeContext")
                        for row in node_details.leaves:
                            if "fqdn" in row.key:
                                cluster_obj.hosts.append(row.value)
            cluster_details_list.append(cluster_obj)
        return cluster_details_list
    except (etcd.EtcdKeyNotFound, KeyError) as ex:
        logger.log("error", NS.get("publisher_id", None),
                   {'message': str(ex)})
        return None
def _enable_disable_volume_profiling(self):
    cluster = NS.tendrl.objects.Cluster(
        integration_id=NS.tendrl_context.integration_id).load()
    volumes = NS.gluster.objects.Volume().load_all() or []
    # Enabling/disabling based on the cluster flag volume_profiling_flag
    # should be done only once, during the first sync. Later the cluster
    # level volume_profiling_state is derived from the individual
    # volume level values
    first_sync_done = etcd_utils.read(
        "/clusters/%s/nodes/%s/NodeContext/first_sync_done" % (
            NS.tendrl_context.integration_id,
            NS.node_context.node_id
        )
    ).value
    if first_sync_done in [None, "no", ""]:
        failed_vols = []
        if cluster.volume_profiling_flag == "enable":
            for volume in volumes:
                if volume.profiling_enabled == "yes":
                    continue
                out, err, rc = cmd_utils.Command(
                    "gluster volume profile %s start" % volume.name
                ).run()
                if (err or rc != 0) and "already started" in err:
                    failed_vols.append(volume.name)
            if len(failed_vols) > 0:
                logger.log(
                    "debug",
                    NS.publisher_id,
                    {
                        "message": "Profiling already "
                                   "enabled for volumes: %s" %
                                   str(failed_vols)
                    }
                )
            cluster.volume_profiling_state = "enabled"
        if cluster.volume_profiling_flag == "disable":
            for volume in volumes:
                if volume.profiling_enabled == "no":
                    continue
                out, err, rc = cmd_utils.Command(
                    "gluster volume profile %s stop" % volume.name
                ).run()
                if (err or rc != 0) and "not started" in err:
                    failed_vols.append(volume.name)
            if len(failed_vols) > 0:
                logger.log(
                    "debug",
                    NS.publisher_id,
                    {
                        "message": "Profiling not "
                                   "enabled for volumes: %s" %
                                   str(failed_vols)
                    }
                )
            cluster.volume_profiling_state = "disabled"
    # Derive the cluster level state from the per-volume values
    profiling_enabled_count = 0
    for volume in volumes:
        if volume.profiling_enabled == "yes":
            profiling_enabled_count += 1
    if profiling_enabled_count == 0:
        cluster.volume_profiling_state = "disabled"
    elif profiling_enabled_count == len(volumes):
        cluster.volume_profiling_state = "enabled"
    elif profiling_enabled_count < len(volumes):
        cluster.volume_profiling_state = "mixed"
    cluster.save()
def get_object_from_central_store(self, resource_key, obj_attr):
    attr_details = etcd_utils.read(resource_key)
    resource_details = {"details": []}
    for attr_detail in attr_details.leaves:
        resource_detail = {}
        attr_key = attr_detail.key.rsplit("/", 1)[1]
        for key, value in obj_attr["attrs"].items():
            sub_attr = etcd_utils.read(
                os.path.join(resource_key, attr_key, key))
            resource_detail[key] = sub_attr.value
        resource_details["details"].append(resource_detail)
    try:
        if obj_attr["count"]:
            resource_details = self.get_resource_count(
                resource_details, obj_attr)
    except KeyError:
        pass
    return resource_details
def run(self):
    try:
        all_node_status_up = True
        # Check whether the job is a parent or a child
        job = NS.tendrl.objects.Job(
            job_id=self.parameters['job_id']).load()
        if "parent" not in job.payload:
            # fetch node ids using integration_id
            integration_id = self.parameters[
                'TendrlContext.integration_id']
            key = "indexes/tags/tendrl/integration/%s" % integration_id
            node_ids_str = etcd_utils.read(key).value
            node_ids = json.loads(node_ids_str)
            # identify node status using node_id
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Checking status of nodes %s" % str(node_ids)},
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            nodes_up = []
            nodes_down = []
            for node in node_ids:
                node = str(node)
                # if node_context is not found, status defaults to DOWN
                node_context = NS.tendrl.objects.NodeContext(
                    node_id=node, status='DOWN').load()
                if node_context.status == "UP":
                    nodes_up.append(node)
                else:
                    all_node_status_up = False
                    nodes_down.append(node)
            if all_node_status_up:
                logger.log("info",
                           NS.publisher_id,
                           {"message": "Nodes %s are up" % nodes_up},
                           job_id=self.parameters['job_id'],
                           flow_id=self.parameters['flow_id'])
            else:
                logger.log("info",
                           NS.publisher_id,
                           {"message": "Nodes %s are down" % nodes_down},
                           job_id=self.parameters['job_id'],
                           flow_id=self.parameters['flow_id'])
        # no need to check for a child job
        return all_node_status_up
    except (etcd.EtcdKeyNotFound, TypeError) as ex:
        logger.log(
            "error",
            NS.get("publisher_id", None),
            {
                "message": "Error checking status of nodes. "
                           "Error: %s" % str(ex)
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id']
        )
        return False
def run(self):
    aggregate_gluster_objects = NS.monitoring.definitions.\
        get_parsed_defs()["namespace.monitoring"]["graphite_data"]
    _sleep = 0
    while not self._complete.is_set():
        # update the monitoring tag in each sync
        NS.node_context = NS.node_context.load()
        current_tags = list(NS.node_context.tags)
        if "tendrl/integration/monitoring" not in current_tags:
            current_tags += ["tendrl/integration/monitoring"]
            NS.node_context.tags = list(set(current_tags))
            NS.node_context.save()
        if self.sync_interval is None:
            try:
                config_data = json.loads(
                    etcd_utils.read("_NS/gluster/config/data").value)
                try:
                    self.sync_interval = int(
                        config_data['data']['sync_interval'])
                except ValueError as ex:
                    logger.log(
                        "error",
                        NS.get("publisher_id", None),
                        {
                            'message': "Unable to parse tendrl-gluster-"
                                       "integration config 'sync_interval'"
                        }
                    )
                    raise ex
            except etcd.EtcdKeyNotFound:
                # Before cluster import sync_interval is not populated
                time.sleep(DEFAULT_SLEEP)
                continue
        if _sleep > 5:
            _sleep = self.sync_interval
        else:
            _sleep += 1
        try:
            cluster_details = self.plugin_obj.get_central_store_data(
                aggregate_gluster_objects)
            graphite_utils.create_cluster_alias(cluster_details)
            metrics = graphite_utils.create_metrics(
                aggregate_gluster_objects, cluster_details)
            metric_list = []
            for metric in metrics:
                for key, value in metric.items():
                    if value:
                        metric_list.append(
                            "tendrl.%s %s %d" % (
                                key, value, int(time.time())))
            self.plugin_obj.push_metrics(metric_list)
            # Create or refresh the alert dashboard
            if _sleep > 5:
                SyncAlertDashboard().refresh_dashboard()
            time.sleep(_sleep)
        except (etcd.EtcdKeyNotFound, AttributeError, KeyError) as ex:
            logger.log("error",
                       NS.get("publisher_id", None),
                       {'message': str(ex)})
            time.sleep(_sleep)
def run():
    try:
        nodes = etcd_utils.read("/nodes")
        for node in nodes.leaves:
            node_id = node.key.split("/")[-1]
            _node_context = NS.tendrl.objects.NodeContext(
                node_id=node_id
            ).load()
            if _node_context.fqdn:
                _node_context.watch_attrs()
    except etcd.EtcdKeyNotFound:
        pass
    return
def _sync_cluster_network_details(self):
    try:
        etcd_utils.read("clusters/%s/cluster_network" %
                        NS.tendrl_context.integration_id)
    except etcd.EtcdKeyNotFound:
        try:
            cluster_config = NS.ceph.objects.SyncObject(
                sync_type='config').load().data
            cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id).load()
            cluster.public_network = cluster_config['public_network']
            cluster.cluster_network = cluster_config['cluster_network']
            cluster.save()
        except etcd.EtcdKeyNotFound as ex:
            Event(
                Message(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={
                        'message': "Failed to sync cluster network details"
                    }
                )
            )
            raise ex
def on_change(self, attr, prev_value, current_value):
    if attr == "status":
        if current_value is None:
            self.status = "DOWN"
            self.save()
            msg = "Node {0} is DOWN".format(self.fqdn)
            event_utils.emit_event(
                "node_status",
                self.status,
                msg,
                "node_{0}".format(self.fqdn),
                "WARNING",
                node_id=self.node_id
            )
            _tc = NS.tendrl.objects.TendrlContext(
                node_id=self.node_id).load()
            _tag = "provisioner/%s" % _tc.integration_id
            if _tag in self.tags:
                _index_key = "/indexes/tags/%s" % _tag
                self.tags.remove(_tag)
                self.save()
                etcd_utils.delete(_index_key)
                _msg = "node_sync, STALE provisioner node "\
                       "found! re-configuring monitoring "\
                       "(job-id: %s) on this node"
                payload = {
                    "tags": ["tendrl/node_%s" % self.node_id],
                    "run": "tendrl.flows.ConfigureMonitoring",
                    "status": "new",
                    "parameters": {
                        'TendrlContext.integration_id': _tc.integration_id
                    },
                    "type": "node"
                }
                _job_id = str(uuid.uuid4())
                NS.tendrl.objects.Job(
                    job_id=_job_id,
                    status="new",
                    payload=payload
                ).save()
                logger.log("debug", NS.publisher_id,
                           {"message": _msg % _job_id})
            if _tc.sds_name == "gluster":
                bricks = etcd_utils.read(
                    "clusters/{0}/Bricks/all/{1}".format(
                        _tc.integration_id,
                        self.fqdn
                    )
                )
                for brick in bricks.leaves:
                    try:
                        etcd_utils.write(
                            "{0}/status".format(brick.key),
                            "Stopped"
                        )
                    except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                        pass
def find_volume_name(integration_id, hostname, brick_path):
    try:
        vol_name = etcd_utils.read(
            "clusters/%s/Bricks/all/%s/%s/vol_name" % (
                integration_id, hostname, brick_path
            )
        ).value
        return vol_name
    except EtcdKeyNotFound as ex:
        logger.log(
            "debug",
            NS.publisher_id,
            {
                "message": "Unable to find volume name for brick"
                           " %s:%s" % (hostname, brick_path)
            }
        )
        raise ex
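# Illustrative call (host and brick path are placeholders); the brick path
# component is the "_"-escaped form of the on-disk path, as used by the
# brick-removal handler later in this section:
#   vol_name = find_volume_name(
#       integration_id, "node1.example.com", "_bricks_brick1")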
def load_all(self):
    ins = []
    try:
        self.render()
        value = '/'.join(self.value.split('/')[:-1])
        etcd_resp = etcd_utils.read(value)
        for item in etcd_resp.leaves:
            # When the directory is not empty, NS._int.client.read(key)
            # returns key + directory id as the new key. If the directory
            # is empty it returns the key only, and if the directory is
            # not present it raises EtcdKeyNotFound
            if item.key.strip("/") != value.strip("/"):
                # if the dir is empty then item.key and value are the same
                self.value = item.key
                ins.append(self.load())
    except etcd.EtcdKeyNotFound:
        pass
    return ins
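# Usage as seen elsewhere in this section: load_all() lists every sibling
# object under the parent directory of the rendered key, e.g.
#   volumes = NS.tendrl.objects.GlusterVolume(
#       NS.tendrl_context.integration_id).load_all()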
def hash_compare_with_central_store(self, ttl=None):
    try:
        # Generate the current in-memory object hash
        self.hash = self._hash()
        _hash_key = "/{0}/hash".format(self.value)
        _stored_hash = None
        try:
            _stored_hash = etcd_utils.read(_hash_key).value
        except etcd.EtcdKeyNotFound:
            return False
        if self.hash == _stored_hash:
            # No changes between the stored object and the current
            # object, don't save the current object to central store
            if ttl:
                etcd_utils.refresh(self.value, ttl)
            return True
        else:
            return False
    except TypeError:
        # no hash for this object, save the current hash as is
        return False
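# load() (below) short-circuits on this check: when the stored hash matches
# the in-memory hash, the cached object is returned as-is and, if a ttl was
# passed, only the key's TTL is refreshed instead of rewriting the value.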
def load(self):
    self.render()
    _copy = self._copy_vars()
    # Check if self.value already set, use it
    if self.value.find('{') < 0:
        _copy.value = self.value
    if "Message" not in _copy.__class__.__name__:
        # If local object.hash is equal to
        # central_store object.hash, return
        if self.hash_compare_with_central_store():
            return _copy
    key = _copy.value + '/data'
    try:
        val_str = etcd_utils.read(key).value
    except etcd.EtcdKeyNotFound:
        return _copy
    loc_dict = json.loads(val_str)
    for attr_name, attr_val in vars(_copy).iteritems():
        _type = self._defs.get("attrs", {}).get(
            attr_name, {}
        ).get("type")
        if loc_dict.get(attr_name) in [None, ""]:
            if _type and _type.lower() == 'list':
                setattr(_copy, attr_name, list())
            if _type and _type.lower() == 'json':
                setattr(_copy, attr_name, dict())
        else:
            if _type and _type.lower() in ['list']:
                setattr(
                    _copy, attr_name,
                    json.loads(loc_dict[attr_name])
                )
            else:
                setattr(_copy, attr_name, loc_dict[attr_name])
    return _copy
def __init__(self, node_id=None, fqdn=None, ipv4_addr=None, tags=None,
             status=None, sync_status=None, last_sync=None, pkey=None,
             locked_by=None, *args, **kwargs):
    super(NodeContext, self).__init__(*args, **kwargs)
    self.node_id = node_id or self._get_node_id() or \
        self._create_node_id()
    self.fqdn = fqdn
    self.ipv4_addr = ipv4_addr
    if self.fqdn:
        self.ipv4_addr = socket.gethostbyname(self.fqdn)
    self.locked_by = locked_by
    curr_tags = []
    try:
        _nc_data = etcd_utils.read(
            "/nodes/%s/NodeContext/data" % self.node_id
        ).value
        curr_tags = json.loads(_nc_data)['tags']
    except etcd.EtcdKeyNotFound:
        pass
    try:
        # tags may themselves be stored as a JSON encoded string
        curr_tags = json.loads(curr_tags)
    except (ValueError, TypeError):
        # No existing tags
        pass
    self.tags = tags or []
    self.tags += NS.config.data.get('tags', [])
    self.tags += curr_tags
    self.tags = list(set(self.tags))
    self.status = status or "UP"
    self.sync_status = sync_status
    self.last_sync = last_sync
    self.pkey = pkey or self.fqdn
    self.value = 'nodes/{0}/NodeContext'
def volume_remove_brick_force(self, event):
    time.sleep(self.sync_interval)
    # The event returns the bricks list as a space separated single string
    bricks = event['message']['bricks'].split(" ")
    try:
        for brick in bricks:
            # find the fqdn using the ip
            ip = socket.gethostbyname(brick.split(":/")[0])
            node_id = etcd_utils.read("indexes/ip/%s" % ip).value
            fqdn = NS.tendrl.objects.ClusterNodeContext(
                node_id=node_id
            ).load().fqdn
            brick = fqdn + ":" + brick.split(":")[-1]
            fetched_brick = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                fqdn=brick.split(":/")[0],
                brick_dir=brick.split(":/")[1].replace('/', '_')
            ).load()
            # delete the brick
            etcd_utils.delete(
                "clusters/{0}/Bricks/all/{1}/{2}".format(
                    NS.tendrl_context.integration_id,
                    brick.split(":/")[0],
                    brick.split(":/")[1].replace('/', '_')
                ),
                recursive=True,
            )
            # delete the alert dashboard
            job_id = monitoring_utils.update_dashboard(
                "%s|%s" % (event['message']['volume'], brick),
                RESOURCE_TYPE_BRICK,
                NS.tendrl_context.integration_id,
                "delete"
            )
            logger.log(
                "debug",
                NS.publisher_id,
                {
                    "message": "Update dashboard job %s "
                               "created" % job_id
                }
            )
            # delete brick details from graphite
            job_id = monitoring_utils.delete_resource_from_graphite(
                "%s|%s" % (event['message']['volume'], brick),
                RESOURCE_TYPE_BRICK,
                NS.tendrl_context.integration_id,
                "delete"
            )
            logger.log(
                "debug",
                NS.publisher_id,
                {
                    "message": "Delete resource from graphite job %s "
                               "created" % job_id
                }
            )
        volume_brick_path = "clusters/{0}/Volumes/{1}/"\
                            "Bricks".format(
                                NS.tendrl_context.integration_id,
                                fetched_brick.vol_id,
                            )
        # remove all the brick information under the volume, as the
        # subvolume might have changed; let the next sync handle
        # the update of brick info
        etcd_utils.delete(
            volume_brick_path,
            recursive=True
        )
        _trigger_sync_key = 'clusters/%s/_sync_now' % \
            NS.tendrl_context.integration_id
        etcd_utils.write(_trigger_sync_key, 'true')
        etcd_utils.refresh(_trigger_sync_key, self.sync_interval)
    except etcd.EtcdKeyNotFound:
        logger.log(
            "debug",
            NS.publisher_id,
            {
                "message": "Unable to delete bricks %s" % bricks
            }
        )
def sync_volumes(
    volumes, index, vol_options, sync_ttl,
    cluster_short_name, devicetree, lvs
):
    NS.node_context = NS.tendrl.objects.NodeContext().load()
    tag_list = NS.node_context.tags
    # Raise alerts for volume state change.
    cluster_provisioner = "provisioner/%s" % \
        NS.tendrl_context.integration_id
    if cluster_provisioner in tag_list:
        try:
            _volume = NS.tendrl.objects.GlusterVolume(
                NS.tendrl_context.integration_id,
                vol_id=volumes['volume%s.id' % index]
            ).load()
            if _volume.locked_by and 'job_id' in _volume.locked_by and \
                    _volume.current_job.get('status', '') == 'in_progress':
                # There is a job active on the volume, skip the sync
                return
            stored_volume_status = _volume.status
            current_status = volumes['volume%s.status' % index]
            if stored_volume_status not in [None, ""] and \
                    current_status != stored_volume_status:
                msg = ("Status of volume: %s in cluster %s "
                       "changed from %s to %s") % (
                           volumes['volume%s.name' % index],
                           cluster_short_name,
                           stored_volume_status,
                           current_status)
                instance = "volume_%s" % volumes[
                    'volume%s.name' % index
                ]
                event_utils.emit_event(
                    "volume_status",
                    current_status,
                    msg,
                    instance,
                    'WARNING' if current_status == 'Stopped'
                    else 'INFO',
                    tags={"entity_type": RESOURCE_TYPE_VOLUME,
                          "volume_name": volumes['volume%s.name' % index]
                          }
                )
        except (KeyError, etcd.EtcdKeyNotFound) as ex:
            if isinstance(ex, KeyError):
                raise ex

    volume = NS.tendrl.objects.GlusterVolume(
        NS.tendrl_context.integration_id,
        vol_id=volumes['volume%s.id' % index]
    ).load()
    volume.vol_type = "arbiter" \
        if int(volumes['volume%s.arbiter_count' % index]) > 0 \
        else volumes['volume%s.type' % index]
    volume.name = volumes['volume%s.name' % index]
    volume.transport_type = volumes['volume%s.transport_type' % index]
    volume.status = volumes['volume%s.status' % index]
    volume.brick_count = volumes['volume%s.brickcount' % index]
    volume.snap_count = volumes['volume%s.snap_count' % index]
    volume.stripe_count = volumes['volume%s.stripe_count' % index]
    volume.replica_count = volumes['volume%s.replica_count' % index]
    volume.subvol_count = volumes['volume%s.subvol_count' % index]
    volume.arbiter_count = volumes['volume%s.arbiter_count' % index]
    volume.disperse_count = volumes['volume%s.disperse_count' % index]
    volume.redundancy_count = volumes['volume%s.redundancy_count' % index]
    volume.quorum_status = volumes['volume%s.quorum_status' % index]
    volume.snapd_status = volumes[
        'volume%s.snapd_svc.online_status' % index]
    volume.snapd_inited = volumes['volume%s.snapd_svc.inited' % index]
    if NS.tendrl.objects.GlusterVolume(
        NS.tendrl_context.integration_id,
        vol_id=volumes['volume%s.id' % index]
    ).exists():
        existing_vol = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=volumes['volume%s.id' % index]
        ).load()
        volume_profiling_old_value = existing_vol.profiling_enabled
    else:
        volume_profiling_old_value = volume.profiling_enabled
    if ('volume%s.profile_enabled' % index) in volumes:
        value = int(volumes['volume%s.profile_enabled' % index])
        if value == 1:
            volume_profiling_new_value = "yes"
        else:
            volume_profiling_new_value = "no"
    else:
        volume_profiling_new_value = None
    volume.profiling_enabled = volume_profiling_new_value
    if volume_profiling_old_value not in [None, ""] and \
            volume_profiling_old_value != volume_profiling_new_value:
        # Raise an alert for the profiling value change
        msg = ("Value of volume profiling for volume: %s "
               "of cluster %s changed from %s to %s" % (
                   volumes['volume%s.name' % index],
                   cluster_short_name,
                   volume_profiling_old_value,
                   volume_profiling_new_value))
        instance = "volume_%s" % volumes['volume%s.name' % index]
        event_utils.emit_event(
            "volume_profiling_status",
            volume_profiling_new_value,
            msg,
            instance,
            'INFO',
            tags={
                "entity_type": RESOURCE_TYPE_BRICK,
                "volume_name": volumes['volume%s.name' % index]
            }
        )
    volume.save(ttl=sync_ttl)
    # Save the default values of the volume options
    vol_opt_dict = {}
    for opt_count in \
            range(1, int(vol_options['volume%s.options.count' % index])):
        vol_opt_dict[
            vol_options['volume%s.options.key%s' % (index, opt_count)]
        ] = vol_options[
            'volume%s.options.value%s' % (index, opt_count)
        ]
    volume.options = vol_opt_dict
    volume.save()
    rebal_det = NS.gluster.objects.RebalanceDetails(
        vol_id=volumes['volume%s.id' % index],
        rebal_id=volumes['volume%s.rebalance.id' % index],
        rebal_status=volumes['volume%s.rebalance.status' % index],
        rebal_failures=volumes['volume%s.rebalance.failures' % index],
        rebal_skipped=volumes['volume%s.rebalance.skipped' % index],
        rebal_lookedup=volumes['volume%s.rebalance.lookedup' % index],
        rebal_files=volumes['volume%s.rebalance.files' % index],
        rebal_data=volumes['volume%s.rebalance.data' % index],
        time_left=volumes.get('volume%s.rebalance.time_left' % index),
    )
    rebal_det.save(ttl=sync_ttl)
    georep_details.save_georep_details(volumes, index)

    b_index = 1
    # ipv4 addresses of the current node
    try:
        network_ip = []
        networks = NS.tendrl.objects.NodeNetwork().load_all()
        for network in networks:
            if network.ipv4:
                network_ip.extend(network.ipv4)
    except etcd.EtcdKeyNotFound as ex:
        Event(
            ExceptionMessage(
                priority="debug",
                publisher=NS.publisher_id,
                payload={
                    "message": "Could not find "
                               "any ipv4 networks for node"
                               " %s" % NS.node_context.node_id,
                    "exception": ex
                }
            )
        )
    while True:
        try:
            # Update bricks node wise
            hostname = volumes[
                'volume%s.brick%s.hostname' % (index, b_index)
            ]
            ip = socket.gethostbyname(hostname)
            try:
                node_id = etcd_utils.read("indexes/ip/%s" % ip).value
                fqdn = NS.tendrl.objects.ClusterNodeContext(
                    node_id=node_id
                ).load().fqdn
                cluster_node_ids = etcd_utils.read(
                    "indexes/tags/tendrl/integration/%s" %
                    NS.tendrl_context.integration_id
                ).value
                cluster_node_ids = json.loads(cluster_node_ids)
                if NS.node_context.fqdn != fqdn or \
                        node_id not in cluster_node_ids:
                    b_index += 1
                    continue
            except (TypeError, etcd.EtcdKeyNotFound):
                b_index += 1
                continue
            sub_vol_size = (int(
                volumes['volume%s.brickcount' % index]
            )) / int(
                volumes['volume%s.subvol_count' % index]
            )
            brick_name = NS.node_context.fqdn
            brick_name += ":"
            brick_name += volumes[
                'volume%s.brick%s.path' % (index, b_index)
            ].split(":")[-1].replace("/", "_")

            # Raise alerts if the brick path changes
            try:
                stored_brick = NS.tendrl.objects.GlusterBrick(
                    NS.tendrl_context.integration_id,
                    NS.node_context.fqdn,
                    brick_dir=brick_name.split(":_")[-1]
                ).load()
                current_status = volumes.get(
                    'volume%s.brick%s.status' % (index, b_index)
                )
                if stored_brick.status and \
                        current_status != stored_brick.status:
                    msg = ("Brick:%s in volume:%s has %s") % (
                        volumes['volume%s.brick%s.path' % (
                            index, b_index)],
                        volumes['volume%s.name' % index],
                        current_status)
                    instance = "volume_%s|brick_%s" % (
                        volumes['volume%s.name' % index],
                        volumes['volume%s.brick%s.path' % (
                            index, b_index)]
                    )
                    event_utils.emit_event(
                        "brick_status",
                        current_status,
                        msg,
                        instance,
                        'WARNING' if current_status == 'Stopped'
                        else 'INFO',
                        tags={"entity_type": RESOURCE_TYPE_BRICK,
                              "volume_name": volumes[
                                  'volume%s.name' % index]
                              }
                    )
            except etcd.EtcdKeyNotFound:
                pass

            brk_pth = "clusters/%s/Volumes/%s/Bricks/subvolume%s/%s"
            vol_brick_path = brk_pth % (
                NS.tendrl_context.integration_id,
                volumes['volume%s.id' % index],
                str((b_index - 1) / sub_vol_size),
                brick_name
            )
            etcd_utils.write(vol_brick_path, "")
            brick = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                NS.node_context.fqdn,
                brick_dir=brick_name.split(":_")[-1]
            ).load()
            brick.integration_id = NS.tendrl_context.integration_id
            brick.fqdn = NS.node_context.fqdn
            brick.brick_dir = brick_name.split(":_")[-1]
            brick.name = brick_name
            brick.vol_id = volumes['volume%s.id' % index]
            brick.sequence_number = b_index
            brick.brick_path = volumes[
                'volume%s.brick%s.path' % (index, b_index)
            ]
            brick.hostname = volumes.get(
                'volume%s.brick%s.hostname' % (index, b_index)
            )
            brick.port = volumes.get(
                'volume%s.brick%s.port' % (index, b_index)
            )
            brick.vol_name = volumes['volume%s.name' % index]
            brick.used = True
            brick.node_id = NS.node_context.node_id
            brick.status = volumes.get(
                'volume%s.brick%s.status' % (index, b_index)
            )
            brick.filesystem_type = volumes.get(
                'volume%s.brick%s.filesystem_type' % (index, b_index)
            )
            brick.mount_opts = volumes.get(
                'volume%s.brick%s.mount_options' % (index, b_index)
            )
            brick.utilization = brick_utilization.brick_utilization(
                volumes['volume%s.brick%s.path' % (index, b_index)],
                lvs
            )
            brick.client_count = volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            )
            brick.is_arbiter = volumes.get(
                'volume%s.brick%s.is_arbiter' % (index, b_index)
            )
            brick.save(ttl=sync_ttl)
            # sync brick device details
            brick_device_details.update_brick_device_details(
                brick_name,
                volumes['volume%s.brick%s.path' % (index, b_index)],
                devicetree,
                sync_ttl
            )
            # Sync the brick client details
            c_index = 1
            if volumes.get(
                'volume%s.brick%s.client_count' % (index, b_index)
            ) > 0:
                while True:
                    try:
                        NS.gluster.objects.ClientConnection(
                            brick_name=brick_name,
                            fqdn=NS.node_context.fqdn,
                            brick_dir=brick_name.split(":_")[-1],
                            hostname=volumes[
                                'volume%s.brick%s.client%s.hostname' % (
                                    index, b_index, c_index)
                            ],
                            bytesread=volumes[
                                'volume%s.brick%s.client%s.bytesread' % (
                                    index, b_index, c_index)
                            ],
                            byteswrite=volumes[
                                'volume%s.brick%s.client%s.byteswrite' % (
                                    index, b_index, c_index)
                            ],
                            opversion=volumes[
                                'volume%s.brick%s.client%s.opversion' % (
                                    index, b_index, c_index)
                            ]
                        ).save(ttl=sync_ttl)
                    except KeyError:
                        break
                    c_index += 1
            sync_ttl += 4
            b_index += 1
        except KeyError:
            break
    return b_index
def update_cluster_details(self, integration_id):
    try:
        nodes = etcd_utils.read(
            "/clusters/%s/nodes" % integration_id
        )
        for node in nodes.leaves:
            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=node.key.split("/")[-1],
                integration_id=integration_id
            ).load()
            # Verify all nodes in a cluster are down
            if str(_cnc.status).lower() != "down" and \
                    str(_cnc.is_managed).lower() == "yes":
                # If any one managed node is not down, don't update the
                # cluster details; unmanaged nodes need not be considered
                return
        # when all managed nodes are down, update the cluster details
        global_details = NS.tendrl.objects.GlobalDetails(
            integration_id=integration_id
        ).load()
        # Update cluster as unhealthy
        if global_details.status.lower() == "healthy":
            global_details.status = "unhealthy"
            global_details.save()
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=integration_id
            ).load()
            msg = "Cluster:%s is %s" % (
                _cluster.short_name, "unhealthy")
            instance = "cluster_%s" % integration_id
            event_utils.emit_event(
                "cluster_health_status",
                "unhealthy",
                msg,
                instance,
                'WARNING',
                integration_id=integration_id
            )
        # Mark all bricks as down
        nodes = etcd_utils.read(
            "/clusters/%s/Bricks/all" % integration_id
        )
        for node in nodes.leaves:
            bricks = NS.tendrl.objects.GlusterBrick(
                integration_id,
                fqdn=node.key.split("/")[-1]
            ).load_all()
            for brick in bricks:
                if brick.status.lower() != "stopped":
                    brick.status = "Stopped"
                    brick.save()
                    msg = ("Brick:%s in volume:%s has %s") % (
                        brick.brick_path,
                        brick.vol_name,
                        "Stopped"
                    )
                    instance = "volume_%s|brick_%s" % (
                        brick.vol_name,
                        brick.brick_path
                    )
                    event_utils.emit_event(
                        "brick_status",
                        "Stopped",
                        msg,
                        instance,
                        "WARNING",
                        integration_id=integration_id,
                        tags={"entity_type": "brick",
                              "volume_name": brick.vol_name,
                              "node_id": brick.node_id
                              }
                    )
        # Mark all volumes as down
        volumes = NS.tendrl.objects.GlusterVolume(
            integration_id
        ).load_all()
        for volume in volumes:
            if volume.state.lower() != "down":
                volume.state = "down"
                volume.status = "Stopped"
                volume.save()
                msg = "Volume:%s is %s" % (volume.name, "down")
                instance = "volume_%s" % volume.name
                event_utils.emit_event(
                    "volume_state",
                    "down",
                    msg,
                    instance,
                    "WARNING",
                    integration_id=integration_id,
                    tags={"entity_type": "volume",
                          "volume_name": volume.name
                          }
                )
    except etcd.EtcdKeyNotFound:
        pass
def run(self): logger.log( "info", NS.publisher_id, {"message": "%s running" % self.__class__.__name__} ) gluster_brick_dir = NS.gluster.objects.GlusterBrickDir() gluster_brick_dir.save() cluster = NS.tendrl.objects.Cluster( integration_id=NS.tendrl_context.integration_id ).load() if cluster.cluster_network in [None, ""]: try: node_networks = NS.tendrl.objects.NodeNetwork().load_all() cluster.cluster_network = node_networks[0].subnet cluster.save() except etcd.EtcdKeyNotFound as ex: logger.log( "error", NS.publisher_id, {"message": "Failed to sync cluster network details"} ) _sleep = 0 while not self._complete.is_set(): # To detect out of band deletes # refresh gluster object inventory at config['sync_interval'] SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 100 NS.node_context = NS.node_context.load() NS.tendrl_context = NS.tendrl_context.load() if _sleep > 5: _sleep = int(NS.config.data.get("sync_interval", 10)) else: _sleep += 1 try: _cluster = NS.tendrl.objects.Cluster( integration_id=NS.tendrl_context.integration_id ).load() if (_cluster.status == "importing" and ( _cluster.current_job['status'] == 'failed')) or \ _cluster.status == "unmanaging" or \ _cluster.status == "set_volume_profiling": time.sleep(_sleep) continue _cnc = NS.tendrl.objects.ClusterNodeContext( node_id=NS.node_context.node_id ).load() _cnc.is_managed = "yes" _cnc.save() subprocess.call( [ 'gluster', 'get-state', 'glusterd', 'odir', '/var/run', 'file', 'glusterd-state', 'detail' ] ) raw_data = ini2json.ini_to_dict( '/var/run/glusterd-state' ) subprocess.call(['rm', '-rf', '/var/run/glusterd-state']) subprocess.call( [ 'gluster', 'get-state', 'glusterd', 'odir', '/var/run', 'file', 'glusterd-state-vol-opts', 'volumeoptions' ] ) raw_data_options = ini2json.ini_to_dict( '/var/run/glusterd-state-vol-opts' ) subprocess.call( [ 'rm', '-rf', '/var/run/glusterd-state-vol-opts' ] ) sync_object = NS.gluster.objects.\ SyncObject(data=json.dumps(raw_data)) sync_object.save() if "Peers" in raw_data: index = 1 peers = raw_data["Peers"] disconnected_hosts = [] while True: try: peer = NS.tendrl.\ objects.GlusterPeer( peer_uuid=peers['peer%s.uuid' % index], hostname=peers[ 'peer%s.primary_hostname' % index ], state=peers['peer%s.state' % index], connected=peers['peer%s.connected' % index] ) try: stored_peer_status = None # find peer detail using hostname ip = socket.gethostbyname( peers['peer%s.primary_hostname' % index] ) node_id = etcd_utils.read( "/indexes/ip/%s" % ip ).value stored_peer = NS.tendrl.objects.GlusterPeer( peer_uuid=peers['peer%s.uuid' % index], node_id=node_id ).load() stored_peer_status = stored_peer.connected current_status = peers[ 'peer%s.connected' % index ] if stored_peer_status and \ current_status != stored_peer_status: msg = ( "Peer %s in cluster %s " "is %s" ) % ( peers[ 'peer%s.primary_hostname' % index ], _cluster.short_name, current_status ) instance = "peer_%s" % peers[ 'peer%s.primary_hostname' % index ] event_utils.emit_event( "peer_status", current_status, msg, instance, 'WARNING' if current_status != 'Connected' else 'INFO' ) # save current status in actual peer # directory also stored_peer.connected = current_status stored_peer.save() # Disconnected host name to # raise brick alert if current_status.lower() == \ "disconnected": disconnected_hosts.append( peers[ 'peer%s.primary_hostname' % index ] ) except etcd.EtcdKeyNotFound: pass SYNC_TTL += 5 peer.save(ttl=SYNC_TTL) index += 1 except KeyError: break # Raise an alert for bricks when peer disconnected # or node goes down for 
disconnected_host in disconnected_hosts: brick_status_alert( disconnected_host ) if "Volumes" in raw_data: # create devicetree using lsblk devicetree = get_device_tree() # find lvs lvs = brick_utilization.get_lvs() index = 1 volumes = raw_data['Volumes'] total_brick_count = 0 while True: try: b_count = sync_volumes( volumes, index, raw_data_options.get('Volume Options'), SYNC_TTL + VOLUME_TTL, _cluster.short_name, devicetree, lvs ) index += 1 SYNC_TTL += 1 total_brick_count += b_count - 1 except KeyError: global VOLUME_TTL # from second sync volume ttl is # SYNC_TTL + (no.volumes) * 20 + # (no.of.bricks) * 10 + 160 if index > 1: volume_count = index - 1 # When all nodes are down we are updating all # volumes are down, node status TTL is 160, # So make sure volumes are present in etcd # while raising volume down alert VOLUME_TTL = (volume_count * 20) + ( total_brick_count * 10) + 160 break # populate the volume specific options reg_ex = re.compile("^volume[0-9]+.options+") options = {} for key in volumes.keys(): if reg_ex.match(key): options[key] = volumes[key] for key in options.keys(): volname = key.split('.')[0] vol_id = volumes['%s.id' % volname] dict1 = {} for k, v in options.items(): if k.startswith('%s.options' % volname): dict1['.'.join(k.split(".")[2:])] = v options.pop(k, None) volume = NS.tendrl.objects.GlusterVolume( NS.tendrl_context.integration_id, vol_id=vol_id ).load() if volume.options is not None: dest = dict(volume.options) dest.update(dict1) volume.options = dest volume.save() # Sync cluster global details if "provisioner/%s" % NS.tendrl_context.integration_id \ in NS.node_context.tags: all_volumes = NS.tendrl.objects.GlusterVolume( NS.tendrl_context.integration_id ).load_all() or [] volumes = [] for volume in all_volumes: if not str(volume.deleted).lower() == "true" and \ volume.current_job.get('status', '') \ in ['', 'finished', 'failed'] and \ volume.vol_id not in [None, ''] and \ volume.name not in [None, '']: # only for first sync refresh volume TTL # It will increase TTL based on no.of volumes if _cnc.first_sync_done in [None, "no", ""]: etcd_utils.refresh( volume.value, SYNC_TTL + VOLUME_TTL ) volumes.append(volume) cluster_status.sync_cluster_status( volumes, SYNC_TTL + VOLUME_TTL ) utilization.sync_utilization_details(volumes) client_connections.sync_volume_connections(volumes) georep_details.aggregate_session_status() try: evt.process_events() except etcd.EtcdKeyNotFound: pass rebalance_status.sync_volume_rebalance_status(volumes) rebalance_status.sync_volume_rebalance_estimated_time( volumes ) snapshots.sync_volume_snapshots( raw_data['Volumes'], int(NS.config.data.get( "sync_interval", 10 )) + len(volumes) * 4 ) # update alert count update_cluster_alert_count() # check and enable volume profiling if "provisioner/%s" % NS.tendrl_context.integration_id in \ NS.node_context.tags: self._update_volume_profiling() _cluster = NS.tendrl.objects.Cluster( integration_id=NS.tendrl_context.integration_id ).load() if _cluster.exists(): _cluster = _cluster.load() _cluster.last_sync = str(tendrl_now()) # Mark the first sync done flag _cnc = NS.tendrl.objects.ClusterNodeContext( node_id=NS.node_context.node_id ).load() if _cnc.first_sync_done in [None, "no"]: _cnc.first_sync_done = "yes" _cnc.save() if _cluster.current_job.get( 'status', '' ) in ['', 'finished', 'failed'] and \ _cluster.status in [None, ""]: _cluster.save() except Exception as ex: Event( ExceptionMessage( priority="error", publisher=NS.publisher_id, payload={"message": "gluster sds state sync error", 
"exception": ex } ) ) try: etcd_utils.read( '/clusters/%s/_sync_now' % NS.tendrl_context.integration_id ) continue except etcd.EtcdKeyNotFound: pass time.sleep(_sleep) logger.log( "debug", NS.publisher_id, {"message": "%s complete" % self.__class__.__name__} )
def run(self): if "Node[]" not in self.parameters: integration_id = self.parameters['TendrlContext.integration_id'] short_name = self.parameters.get('Cluster.short_name', None) if short_name: if not re.match('^[a-zA-Z0-9][A-Za-z0-9_]*$', short_name) or \ len(short_name) > 64: raise FlowExecutionFailedError( "Invalid cluster short_name: %s. " "Only alpha-numeric and underscore " "allowed for short name, max length 64 chars" % short_name ) # Check for uniqueness of cluster short name _clusters = NS._int.client.read( '/clusters' ) for entry in _clusters.leaves: _cluster = NS.tendrl.objects.Cluster( integration_id=entry.key.split('/')[-1] ).load() if _cluster.short_name and short_name and \ _cluster.is_managed == 'yes' and \ _cluster.short_name == short_name.strip().lower(): raise FlowExecutionFailedError( "Cluster with name: %s already exists" % short_name ) _cluster = NS.tendrl.objects.Cluster( integration_id=NS.tendrl_context.integration_id).load() if (_cluster.status is not None and _cluster.status != "" and _cluster.current_job['status'] == 'in_progress' and _cluster.status in ["importing", "unmanaging", "expanding"]): raise FlowExecutionFailedError( "Another job in progress for cluster, please wait till " "the job finishes (job_id: %s) (integration_id: %s) " % ( _cluster.current_job['job_id'], _cluster.integration_id ) ) if short_name not in [None, ""]: _cluster.short_name = short_name else: _cluster.short_name = integration_id _cluster.status = "importing" _cluster.current_job = { 'job_id': self.job_id, 'job_name': self.__class__.__name__, 'status': 'in_progress' } _cluster.save() try: integration_id_index_key = \ "indexes/tags/tendrl/integration/%s" % integration_id _node_ids = etcd_utils.read( integration_id_index_key).value self.parameters["Node[]"] = json.loads(_node_ids) except etcd.EtcdKeyNotFound: _cluster = NS.tendrl.objects.Cluster( integration_id=NS.tendrl_context.integration_id).load() _cluster.status = "" _cluster.current_job['status'] = 'failed' _cluster.save() raise FlowExecutionFailedError("Cluster with " "integration_id " "(%s) not found, cannot " "import" % integration_id) else: _cluster = NS.tendrl.objects.Cluster( integration_id=NS.tendrl_context.integration_id ).load() _cluster.volume_profiling_flag = self.parameters[ 'Cluster.volume_profiling_flag'] _cluster.save() try: super(ImportCluster, self).run() # Check if this job is parent and then only set status # This could be called from parent import cluster or # even from expand cluster flow. We should not set the # cluster's current job status from child jobs _job = NS.tendrl.objects.Job(job_id=self.job_id).load() if 'parent' not in _job.payload and _job.status != "failed": _cluster = NS.tendrl.objects.Cluster( integration_id=NS.tendrl_context.integration_id ).load() _cluster.status = "" _cluster.current_job['status'] = "finished" _cluster.is_managed = "yes" _cluster.save() except (FlowExecutionFailedError, AtomExecutionFailedError, Exception) as ex: exc_type, exc_value, exc_traceback = sys.exc_info() _cluster = NS.tendrl.objects.Cluster( integration_id=NS.tendrl_context.integration_id).load() _cluster.status = "" _cluster.current_job['status'] = 'failed' _errors = [] if hasattr(ex, 'message'): _errors = [ex.message] else: _errors = [str(ex)] if _errors: _cluster.errors = _errors _cluster.save() raise FlowExecutionFailedError(str( traceback.format_exception(exc_type, exc_value, exc_traceback) ))
def run(self):
    integration_id = self.parameters['TendrlContext.integration_id']
    _cluster = NS.tendrl.objects.Cluster(
        integration_id=integration_id
    ).load()
    if _cluster.status is not None and _cluster.status != "" and \
            _cluster.status in ["importing", "unmanaging", "expanding"]:
        raise FlowExecutionFailedError(
            "Another job in progress for cluster, please wait till "
            "the job finishes (job_id: %s) (integration_id: %s) " % (
                _cluster.current_job['job_id'],
                integration_id
            )
        )
    _lock_details = {
        'node_id': NS.node_context.node_id,
        'fqdn': NS.node_context.fqdn,
        'tags': NS.node_context.tags,
        'type': NS.type,
        'job_name': self.__class__.__name__,
        'job_id': self.job_id
    }
    _cluster.locked_by = _lock_details
    _cluster.status = "expanding"
    _cluster.current_job = {
        'job_id': self.job_id,
        'job_name': self.__class__.__name__,
        'status': 'in_progress'
    }
    _cluster.save()

    try:
        integration_id_index_key = \
            "indexes/tags/tendrl/integration/%s" % integration_id
        node_ids = etcd_utils.read(
            integration_id_index_key).value
        node_ids = json.loads(node_ids)
    except etcd.EtcdKeyNotFound:
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=integration_id
        ).load()
        _cluster.locked_by = {}
        _cluster.status = "expand_pending"
        _cluster.current_job = {
            'job_id': self.job_id,
            'job_name': self.__class__.__name__,
            'status': 'failed'
        }
        _cluster.save()
        raise FlowExecutionFailedError(
            "Cluster with integration_id "
            "(%s) not found, cannot "
            "expand" % integration_id
        )
    job_ids = []
    new_peers = []
    # Remove the current node from the list as it's already
    # participating in the cluster
    node_ids.remove(NS.node_context.node_id)
    for node_id in node_ids:
        _cnc = NS.tendrl.objects.ClusterNodeContext(
            node_id=node_id
        ).load()
        if _cnc.is_managed not in [None, ""] \
                and _cnc.is_managed.lower() == "yes":
            continue
        params = {
            'TendrlContext.integration_id': integration_id,
            'Node[]': [node_id],
            'Cluster.volume_profiling_flag':
                _cluster.volume_profiling_flag
        }
        payload = {
            "tags": ["tendrl/node_%s" % node_id],
            "run": "tendrl.flows.ImportCluster",
            "status": "new",
            "parent": self.parameters['job_id'],
            "parameters": params,
            "type": "node"
        }
        _job_id = str(uuid.uuid4())
        NS.tendrl.objects.Job(
            job_id=_job_id,
            status="new",
            payload=payload
        ).save()
        logger.log(
            "info",
            NS.publisher_id,
            {
                "message": "ImportCluster %s (jobID: %s) : "
                           "importing host %s" % (
                               _cluster.short_name, _job_id, node_id
                           )
            },
            job_id=self.parameters['job_id']
        )
        job_ids.append(_job_id)
        new_peers.append(node_id)
    loop_count = 0
    # Wait for (no. of nodes) * 6 minutes for import to complete
    wait_count = len(job_ids) * 36
    while True:
        child_jobs_failed = []
        if loop_count >= wait_count:
            logger.log(
                "info",
                NS.publisher_id,
                {
                    "message": "Import jobs not yet complete "
                               "on all new nodes %s on cluster %s. "
                               "Timing out." % (
                                   str(node_ids), _cluster.short_name
                               )
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=integration_id
            ).load()
            _cluster.locked_by = {}
            _cluster.status = "expand_pending"
            _cluster.current_job = {
                'job_id': self.job_id,
                'job_name': self.__class__.__name__,
                'status': 'failed'
            }
            _cluster.save()
            raise FlowExecutionFailedError(
                "Failed to expand cluster with integration_id "
                "(%s)" % integration_id
            )
        time.sleep(10)
        finished = True
        for job_id in job_ids:
            job = NS.tendrl.objects.Job(job_id=job_id).load()
            if job.status not in ["finished", "failed"]:
                finished = False
            elif job.status == "failed":
                child_jobs_failed.append(job.job_id)
        if finished:
            break
        else:
            loop_count += 1
            continue
    if len(child_jobs_failed) > 0:
        _msg = "Child jobs failed are %s" % child_jobs_failed
        logger.log(
            "error",
            NS.publisher_id,
            {"message": _msg},
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id']
        )
        _cluster = NS.tendrl.objects.Cluster(
            integration_id=integration_id
        ).load()
        _cluster.status = "expand_pending"
        _cluster.locked_by = {}
        _cluster.current_job = {
            'status': "failed",
            'job_name': self.__class__.__name__,
            'job_id': self.job_id
        }
        _cluster.save()
        raise FlowExecutionFailedError(
            "Failed to expand cluster with integration_id "
            "(%s)" % integration_id
        )
    _cluster = NS.tendrl.objects.Cluster(
        integration_id=integration_id
    ).load()
    _cluster.status = ""
    _cluster.locked_by = {}
    _cluster.current_job = {
        'status': "finished",
        'job_name': self.__class__.__name__,
        'job_id': self.job_id
    }
    _cluster.save()
    logger.log(
        "info",
        NS.publisher_id,
        {
            "message": "Newly detected nodes: %s added to the "
                       "cluster %s" % (
                           str(new_peers),
                           _cluster.short_name
                       ),
        },
        job_id=self.parameters['job_id'],
        flow_id=self.parameters['flow_id']
    )
    return True
def run(self):
    logger.log(
        "info",
        NS.publisher_id,
        {
            "message": "Deleting cluster details."
        },
        job_id=self.parameters['job_id'],
        flow_id=self.parameters['flow_id'],
    )
    integration_id = self.parameters['TendrlContext.integration_id']

    etcd_keys_to_delete = []
    etcd_keys_to_delete.append(
        "/clusters/%s/nodes" % integration_id
    )
    etcd_keys_to_delete.append(
        "/clusters/%s/Bricks" % integration_id
    )
    etcd_keys_to_delete.append(
        "/clusters/%s/Volumes" % integration_id
    )
    etcd_keys_to_delete.append(
        "/clusters/%s/GlobalDetails" % integration_id
    )
    etcd_keys_to_delete.append(
        "/clusters/%s/TendrlContext" % integration_id
    )
    etcd_keys_to_delete.append(
        "/clusters/%s/Utilization" % integration_id
    )
    etcd_keys_to_delete.append(
        "/clusters/%s/raw_map" % integration_id
    )
    etcd_keys_to_delete.append(
        "/alerting/clusters/%s" % integration_id
    )
    nodes = etcd_utils.read(
        "/clusters/%s/nodes" % integration_id
    )
    node_ids = []
    for node in nodes.leaves:
        node_id = node.key.split("/")[-1]
        node_ids.append(node_id)
        key = "/alerting/nodes/%s" % node_id
        etcd_keys_to_delete.append(
            key
        )
        try:
            # delete node alerts from /alerting/alerts
            node_alerts = etcd_utils.read(key)
            for node_alert in node_alerts.leaves:
                etcd_keys_to_delete.append(
                    "/alerting/alerts/%s" % node_alert.key.split(
                        "/")[-1]
                )
        except etcd.EtcdKeyNotFound:
            # No node alerts, continue
            pass

    # Find the alerting/alerts entries to be deleted
    try:
        cluster_alert_ids = etcd_utils.read(
            "/alerting/clusters/%s" % integration_id
        )
        for entry in cluster_alert_ids.leaves:
            ca_id = entry.key.split("/")[-1]
            etcd_keys_to_delete.append(
                "/alerting/alerts/%s" % ca_id
            )
    except etcd.EtcdKeyNotFound:
        # No cluster alerts, continue
        pass

    # Remove the cluster details
    for key in list(set(etcd_keys_to_delete)):
        try:
            etcd_utils.delete(key, recursive=True)
        except etcd.EtcdKeyNotFound:
            logger.log(
                "debug",
                NS.publisher_id,
                {
                    "message": "%s key not found for deletion" % key
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
            )
            continue
    # remove short name
    cluster = NS.tendrl.objects.Cluster(
        integration_id=integration_id
    ).load()
    cluster.short_name = ""
    cluster.save()
    return True
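# Minimal sketch of the cleanup idiom above (made-up keys, plain
# python-etcd client instead of etcd_utils): collect cluster-scoped
# prefixes, de-duplicate, and delete recursively, treating "key already
# gone" as success rather than an error.
import etcd

client = etcd.Client(host="127.0.0.1", port=2379)


def purge_keys(keys):
    for key in set(keys):  # set() mirrors list(set(...)) above
        try:
            client.delete(key, recursive=True)
        except etcd.EtcdKeyNotFound:
            pass  # already removed; nothing to do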
def snapshot_restored(self, event):
    time.sleep(self.sync_interval)
    message = event["message"]
    volume = message['volume_name']
    volume_id = ""
    bricks_to_remove = []
    # get the list of current bricks by running get-state
    output_dir = '/var/run/'
    output_file = 'glusterd-state-snapshot-%s' % str(uuid.uuid4())
    subprocess.call(
        [
            'gluster',
            'get-state',
            'glusterd',
            'odir',
            output_dir,
            'file',
            output_file,
            'detail'
        ]
    )
    raw_data = ini2json.ini_to_dict(
        output_dir + output_file
    )
    subprocess.call(['rm', '-rf', output_dir + output_file])
    index = 1
    while True:
        try:
            current_vol = 'volume%s.name' % index
            if raw_data['Volumes'][current_vol] == volume:
                current_vol_id = 'volume%s.id' % index
                volume_id = raw_data['Volumes'][current_vol_id]
                break
        except KeyError:
            return
        index += 1
    latest_bricks = []
    b_index = 1
    while True:
        try:
            curr_brick = 'volume%s.brick%s.path' % (
                index,
                b_index
            )
            brick = raw_data['Volumes'][curr_brick]
            b_index += 1
        except KeyError:
            break
        latest_bricks.append(brick)
    # get the list of bricks in etcd for this volume
    sub_volumes = etcd_utils.read(
        "/clusters/{0}/Volumes/{1}/Bricks".format(
            NS.tendrl_context.integration_id,
            volume_id
        )
    )
    for sub_volume in sub_volumes.leaves:
        bricks = etcd_utils.read(
            sub_volume.key
        )
        for brick in bricks.leaves:
            fqdn = brick.key.split('/')[-1].split(':')[0]
            path = brick.key.split('/')[-1].split(':')[-1][1:]
            brick_path = "clusters/{0}/Bricks/" \
                         "all/{1}/{2}".format(
                             NS.tendrl_context.integration_id,
                             fqdn,
                             path
                         )
            brick_full_path = etcd_utils.read(
                "%s/brick_path" % brick_path
            ).value
            if brick_full_path not in latest_bricks:
                bricks_to_remove.append(brick_full_path)
    brick_details = {}
    brick_details["volume"] = volume
    brick_details["bricks"] = " ".join(bricks_to_remove)
    event["message"] = brick_details
    self.volume_remove_brick_force(event)
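# The restore handler above boils down to a set difference: bricks that
# tendrl still tracks in etcd but that no longer appear in the fresh
# get-state output must be force-removed. A hypothetical distilled
# version of that core step:
def stale_bricks(stored_bricks, latest_bricks):
    # order is irrelevant to removal; sort only for stable output
    return sorted(set(stored_bricks) - set(latest_bricks))


assert stale_bricks(
    ["host1:/b1", "host1:/b2"], ["host1:/b1"]
) == ["host1:/b2"]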
def run(self):
    node_ids = self.parameters.get('Node[]')
    if not node_ids or len(node_ids) == 0:
        raise AtomExecutionFailedError("Node[] cannot be empty")

    for node_id in node_ids:
        # Check if node has the OS details populated
        try:
            os_details = etcd_utils.read("nodes/%s/Os" % node_id)
            if os_details.leaves is None:
                logger.log(
                    "error",
                    NS.get("publisher_id", None),
                    {
                        "message": "Node %s doesn't have OS details "
                                   "populated" % NS.node_context.fqdn
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False
        except etcd.EtcdKeyNotFound:
            logger.log(
                "error",
                NS.get("publisher_id", None),
                {
                    "message": "Node %s doesn't have OS details "
                               "populated" % NS.node_context.fqdn
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            return False

        # Check if node has the CPU details populated
        try:
            cpu_details = etcd_utils.read("nodes/%s/Cpu" % node_id)
            if cpu_details.leaves is None:
                logger.log(
                    "error",
                    NS.get("publisher_id", None),
                    {
                        "message": "Node %s doesn't have CPU details "
                                   "populated" % NS.node_context.fqdn
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False
        except etcd.EtcdKeyNotFound:
            logger.log(
                "error",
                NS.get("publisher_id", None),
                {
                    "message": "Node %s doesn't have CPU details "
                               "populated" % NS.node_context.fqdn
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            return False

        # Check if node has the Memory populated
        try:
            memory_details = etcd_utils.read(
                "nodes/%s/Memory" % node_id
            )
            if memory_details.leaves is None:
                logger.log(
                    "error",
                    NS.get("publisher_id", None),
                    {
                        "message": "Node %s doesn't have Memory details "
                                   "populated" % NS.node_context.fqdn
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False
        except etcd.EtcdKeyNotFound:
            logger.log(
                "error",
                NS.get("publisher_id", None),
                {
                    "message": "Node %s doesn't have Memory details "
                               "populated" % NS.node_context.fqdn
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            return False

        # Check if node has networks details populated
        try:
            networks = etcd_utils.read("nodes/%s/Networks" % node_id)
            if networks.leaves is None:
                logger.log(
                    "error",
                    NS.get("publisher_id", None),
                    {
                        "message": "Node %s doesn't have network details "
                                   "populated" % NS.node_context.fqdn
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
                return False
        except etcd.EtcdKeyNotFound:
            logger.log(
                "error",
                NS.get("publisher_id", None),
                {
                    "message": "Node %s doesn't have network details "
                               "populated" % NS.node_context.fqdn
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            return False
    return True
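# The four checks above differ only in the etcd sub-key and the label in
# the message. A sketch of a table-driven equivalent (hypothetical helper
# with the same semantics: every node must have each detail populated);
# read is an injected callable standing in for etcd_utils.read.
import etcd

CHECKS = [
    ("Os", "OS"),
    ("Cpu", "CPU"),
    ("Memory", "Memory"),
    ("Networks", "network"),
]


def node_details_populated(read, node_id):
    for sub_key, label in CHECKS:
        try:
            details = read("nodes/%s/%s" % (node_id, sub_key))
            if details.leaves is None:
                return False, label  # key exists but holds no children
        except etcd.EtcdKeyNotFound:
            return False, label      # key missing entirely
    return True, None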
def run(self):
    integration_id = self.parameters['TendrlContext.integration_id']
    _cluster = NS.tendrl.objects.Cluster(
        integration_id=integration_id
    ).load()
    try:
        # Get the cluster nodes
        nodes = etcd_utils.read("/clusters/%s/nodes" % integration_id)
        child_job_ids = []
        node_ids = []
        for node in nodes.leaves:
            node_id = node.key.split("/")[-1]
            node_ids.append(node_id)
            # Create jobs on nodes for stopping services
            _job_id = str(uuid.uuid4())
            params = {
                "Services[]": ["tendrl-gluster-integration"]
            }
            payload = {
                "tags": ["tendrl/node_%s" % node_id],
                "run": "tendrl.objects.Node.flows.StopServices",
                "status": "new",
                "parameters": params,
                "parent": self.parameters["job_id"],
                "type": "node"
            }
            NS.tendrl.objects.Job(
                job_id=_job_id,
                status="new",
                payload=payload
            ).save()
            child_job_ids.append(_job_id)
            logger.log(
                "info",
                NS.publisher_id,
                {
                    "message": "Stop tendrl services (job: %s) "
                               "on %s in cluster %s" %
                               (_job_id, node_id, _cluster.short_name)
                },
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id'],
            )

        # Wait for (no. of nodes) * 10 secs for stop service job
        # to complete
        loop_count = 0
        wait_count = (len(child_job_ids)) * 2
        while True:
            child_jobs_failed = []
            if loop_count >= wait_count:
                logger.log(
                    "error",
                    NS.publisher_id,
                    {
                        "message": "Stop service jobs on cluster(%s) not "
                                   "yet complete on all nodes(%s). "
                                   "Timing out." %
                                   (_cluster.short_name, str(node_ids))
                    },
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id'],
                )
                # Mark child jobs that did not complete as failed, since
                # the parent job has timed out. This has to be done
                # explicitly because these jobs will still be processed
                # by the node-agent and will keep it busy, which might
                # defer the new jobs or lead to their timeout.
                for child_job_id in child_job_ids:
                    child_job = NS.tendrl.objects.Job(
                        job_id=child_job_id
                    ).load()
                    if child_job.status not in ["finished", "failed"]:
                        child_job.status = "failed"
                        child_job.save()
                return False
            time.sleep(5)
            finished = True
            for child_job_id in child_job_ids:
                child_job = NS.tendrl.objects.Job(
                    job_id=child_job_id
                ).load()
                if child_job.status not in ["finished", "failed"]:
                    finished = False
                elif child_job.status == "failed":
                    child_jobs_failed.append(child_job.job_id)
            if finished:
                break
            else:
                loop_count += 1
                continue
        if len(child_jobs_failed) > 0:
            _msg = "Child jobs failed are %s" % child_jobs_failed
            logger.log(
                "error",
                NS.publisher_id,
                {"message": _msg},
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            return False
    except etcd.EtcdKeyNotFound:
        pass
    return True
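# The timeout branch above explicitly fails children that never
# finished, so a busy node-agent does not keep chewing on stale work.
# A minimal sketch of that design choice with a hypothetical dict-shaped
# job (the source persists the change via job.save()):
def fail_unfinished(jobs):
    for job in jobs:
        if job["status"] not in ("finished", "failed"):
            job["status"] = "failed"  # abandoned on parent timeout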
def run(self):
    try:
        all_node_status_up = True
        # check whether this job is a parent or a child job
        job = NS.tendrl.objects.Job(
            job_id=self.parameters['job_id']
        ).load()
        if "parent" not in job.payload:
            # fetch node ids using integration_id
            integration_id = self.parameters[
                'TendrlContext.integration_id'
            ]
            key = "indexes/tags/tendrl/integration/%s" % \
                integration_id
            node_ids_str = etcd_utils.read(key).value
            node_ids = json.loads(node_ids_str)
            # identify node status using node_id
            logger.log(
                "info",
                NS.publisher_id,
                {"message": "Checking status of nodes %s" %
                 str(node_ids)},
                job_id=self.parameters['job_id'],
                flow_id=self.parameters['flow_id']
            )
            nodes_up = []
            nodes_down = []
            for node in node_ids:
                node = str(node)
                # if node_context is not found, status defaults to DOWN
                node_context = NS.tendrl.objects.NodeContext(
                    node_id=node,
                    status='DOWN'
                ).load()
                if node_context.status == "UP":
                    nodes_up.append(node)
                else:
                    all_node_status_up = False
                    nodes_down.append(node)
            if all_node_status_up:
                logger.log(
                    "info",
                    NS.publisher_id,
                    {"message": "Nodes %s are up" % nodes_up},
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
            else:
                logger.log(
                    "info",
                    NS.publisher_id,
                    {"message": "Nodes %s are down" % nodes_down},
                    job_id=self.parameters['job_id'],
                    flow_id=self.parameters['flow_id']
                )
        # no need to check for child job
        return all_node_status_up
    except (etcd.EtcdKeyNotFound, TypeError) as ex:
        logger.log(
            "error",
            NS.get("publisher_id", None),
            {
                "message": "Error checking status of nodes. "
                           "Error: %s" % str(ex)
            },
            job_id=self.parameters['job_id'],
            flow_id=self.parameters['flow_id']
        )
        return False
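# Sketch of the status partitioning above with plain data (no NS
# objects): nodes with no recorded status default to DOWN, and the
# overall check succeeds only when every node is UP.
def partition_nodes(statuses, node_ids):
    up = [n for n in node_ids if statuses.get(n) == "UP"]
    down = [n for n in node_ids if statuses.get(n) != "UP"]
    return up, down


up, down = partition_nodes({"n1": "UP"}, ["n1", "n2"])
assert up == ["n1"] and down == ["n2"]
assert not down == [] or len(up) == 2  # all up only when down is empty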
def volume_delete(self, event):
    time.sleep(self.sync_interval)
    fetched_volumes = NS.tendrl.objects.GlusterVolume(
        NS.tendrl_context.integration_id
    ).load_all()
    for fetched_volume in fetched_volumes:
        if fetched_volume.name == event['message']['name']:
            fetched_volume.deleted = True
            fetched_volume.deleted_at = time_utils.now()
            fetched_volume.save()
            try:
                sub_volumes = etcd_utils.read(
                    "/clusters/{0}/Volumes/{1}/Bricks".format(
                        NS.tendrl_context.integration_id,
                        fetched_volume.vol_id
                    )
                )
                for sub_volume in sub_volumes.leaves:
                    bricks = etcd_utils.read(
                        sub_volume.key
                    )
                    for brick in bricks.leaves:
                        fqdn = brick.key.split('/')[-1].split(':')[0]
                        path = brick.key.split('/')[-1].split(':')[-1][1:]
                        # Delete brick dashboard from grafana
                        brick_obj = NS.tendrl.objects.GlusterBrick(
                            NS.tendrl_context.integration_id,
                            fqdn,
                            path
                        ).load()
                        # Delete brick
                        brick_path = "clusters/{0}/Bricks/" \
                                     "all/{1}/{2}".format(
                                         NS.tendrl_context.integration_id,
                                         fqdn,
                                         path
                                     )
                        etcd_utils.delete(
                            brick_path,
                            recursive=True
                        )
                        brick_full_path = fqdn + ":" + brick_obj.\
                            brick_path.split(":")[-1]
                        job_id = monitoring_utils.update_dashboard(
                            "%s|%s" % (
                                event['message']['name'],
                                brick_full_path
                            ),
                            RESOURCE_TYPE_BRICK,
                            NS.tendrl_context.integration_id,
                            "delete"
                        )
                        logger.log(
                            "debug",
                            NS.publisher_id,
                            {
                                "message": "Update dashboard job %s"
                                           " for brick %s "
                                           "in cluster %s created" % (
                                               job_id,
                                               brick.key.split('/')[-1],
                                               NS.tendrl_context.
                                               integration_id
                                           )
                            }
                        )
                        # Delete brick from graphite
                        job_id = monitoring_utils.\
                            delete_resource_from_graphite(
                                "%s|%s" % (
                                    event['message']['name'],
                                    brick_full_path
                                ),
                                RESOURCE_TYPE_BRICK,
                                NS.tendrl_context.integration_id,
                                "delete"
                            )
                        logger.log(
                            "debug",
                            NS.publisher_id,
                            {
                                "message": "Delete resource "
                                           "from graphite job %s "
                                           "for brick %s in cluster %s "
                                           "created" % (
                                               job_id,
                                               brick.key.split('/')[-1],
                                               NS.tendrl_context.
                                               integration_id
                                           )
                            }
                        )
            except etcd.EtcdKeyNotFound:
                pass
            # Delete volume dashboard from grafana
            job_id = monitoring_utils.update_dashboard(
                event['message']['name'],
                RESOURCE_TYPE_VOLUME,
                NS.tendrl_context.integration_id,
                "delete"
            )
            logger.log(
                "debug",
                NS.publisher_id,
                {
                    "message": "Update dashboard job %s "
                               "created" % job_id
                }
            )
            # Delete volume details from graphite
            job_id = monitoring_utils.delete_resource_from_graphite(
                event['message']['name'],
                RESOURCE_TYPE_VOLUME,
                NS.tendrl_context.integration_id,
                "delete"
            )
            logger.log(
                "debug",
                NS.publisher_id,
                {
                    "message": "Delete resource from graphite job %s "
                               "created" % job_id
                }
            )
def on_change(self, attr, prev_value, current_value):
    if attr == "status" and "tendrl/monitor" in NS.node_context.tags:
        _tc = NS.tendrl.objects.TendrlContext(
            node_id=self.node_id
        ).load()
        # Check node is managed
        _cnc = NS.tendrl.objects.ClusterNodeContext(
            node_id=self.node_id,
            integration_id=_tc.integration_id
        ).load()
        if current_value is None and str(_cnc.is_managed).lower() == "yes":
            self.status = "DOWN"
            self.save()
            msg = "Node {0} is DOWN".format(self.fqdn)
            event_utils.emit_event(
                "node_status",
                self.status,
                msg,
                "node_{0}".format(self.fqdn),
                "WARNING",
                node_id=self.node_id,
                integration_id=_tc.integration_id
            )
            # Saving a fresh ClusterNodeContext also loads node_context,
            # so it gets updated with the latest values
            _cnc_new = \
                NS.tendrl.objects.ClusterNodeContext(
                    node_id=self.node_id,
                    integration_id=_tc.integration_id,
                    first_sync_done=_cnc.first_sync_done,
                    is_managed=_cnc.is_managed
                )
            _cnc_new.save()
            del _cnc_new
            # Update cluster details
            self.update_cluster_details(_tc.integration_id)
            _tag = "provisioner/%s" % _tc.integration_id
            if _tag in self.tags:
                _index_key = "/indexes/tags/%s" % _tag
                self.tags.remove(_tag)
                self.save()
                etcd_utils.delete(_index_key)
            if _tc.sds_name in ["gluster", "RHGS"]:
                bricks = etcd_utils.read(
                    "clusters/{0}/Bricks/all/{1}".format(
                        _tc.integration_id,
                        self.fqdn
                    )
                )
                for brick in bricks.leaves:
                    try:
                        etcd_utils.write(
                            "{0}/status".format(brick.key),
                            "Stopped"
                        )
                    except (etcd.EtcdAlreadyExist, etcd.EtcdKeyNotFound):
                        pass
        elif current_value == "UP" and str(
                _cnc.is_managed).lower() == "yes":
            msg = "{0} is UP".format(self.fqdn)
            event_utils.emit_event(
                "node_status",
                "UP",
                msg,
                "node_{0}".format(self.fqdn),
                "INFO",
                node_id=self.node_id,
                integration_id=_tc.integration_id
            )
        del _cnc
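# Sketch of the transition rule implemented above (illustrative only):
# alert just for managed nodes, treating an expired status key (None) as
# DOWN with WARNING severity, and an explicit "UP" as INFO. Returns None
# when no event should be emitted.
def status_event(current_value, is_managed):
    if str(is_managed).lower() != "yes":
        return None
    if current_value is None:      # status key TTL expired -> node DOWN
        return ("node_status", "DOWN", "WARNING")
    if current_value == "UP":
        return ("node_status", "UP", "INFO")
    return None


assert status_event(None, "yes") == ("node_status", "DOWN", "WARNING")
assert status_event("UP", "no") is None  # unmanaged nodes stay silent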
def _derive_volume_states(volumes):
    out_dict = {}
    for volume in volumes:
        if volume.status == "Stopped":
            out_dict[volume.vol_id] = "down"
        else:
            subvol_count = 0
            bricks = []
            subvol_states = []
            while True:
                try:
                    subvol = etcd_utils.read(
                        "clusters/%s/Volumes/%s/Bricks/subvolume%s" % (
                            NS.tendrl_context.integration_id,
                            volume.vol_id,
                            subvol_count
                        )
                    )
                    state = 0
                    for entry in subvol.leaves:
                        brick_name = entry.key.split("/")[-1]
                        fetched_brick = NS.tendrl.objects.GlusterBrick(
                            NS.tendrl_context.integration_id,
                            brick_name.split(":")[0],
                            brick_name.split(":_")[-1]
                        ).load()
                        if not fetched_brick.status:
                            fetched_brick.status = "Stopped"
                        bricks.append(fetched_brick)
                        if fetched_brick.status != "Started":
                            state += 1
                    subvol_states.append(state)
                    subvol_count += 1
                except etcd.EtcdKeyNotFound:
                    break
            total_bricks = len(bricks)
            up_bricks = 0
            for brick in bricks:
                if brick.status == "Started":
                    up_bricks += 1
            if total_bricks == 0 or total_bricks < int(volume.brick_count):
                # No brick details updated for the volume yet
                out_dict[volume.vol_id] = 'unknown'
            elif up_bricks == 0:
                out_dict[volume.vol_id] = 'down'
            else:
                out_dict[volume.vol_id] = 'up'
                if int(volume.replica_count) > 1 or \
                        int(volume.disperse_count) > 0:
                    worst_subvol = max(subvol_states)
                    if worst_subvol > 0:
                        subvol_prob = max(
                            int(volume.replica_count),
                            int(volume.redundancy_count) + 1
                        )
                        if worst_subvol == subvol_prob:
                            # if this volume contains only one subvolume,
                            # and the bricks down > redundancy level
                            # then the volume state needs to show down
                            if subvol_count == 1:
                                out_dict[volume.vol_id] = 'down'
                            else:
                                out_dict[volume.vol_id] = '(partial)'
                        else:
                            out_dict[volume.vol_id] = '(degraded)'
                else:
                    # This volume is not 'protected', so any brick
                    # disruption leads straight to a 'partial'
                    # availability state
                    if up_bricks != total_bricks:
                        out_dict[volume.vol_id] = '(partial)'
        # Raise the alert if volume state changes
        if volume.state != "" and \
                out_dict[volume.vol_id] not in [volume.state, 'unknown']:
            msg = "Volume:%s is %s" % (volume.name,
                                       out_dict[volume.vol_id])
            instance = "volume_%s" % volume.name
            event_utils.emit_event(
                "volume_state",
                out_dict[volume.vol_id],
                msg,
                instance,
                'INFO' if out_dict[volume.vol_id] == 'up' else 'WARNING',
                tags={"entity_type": RESOURCE_TYPE_VOLUME,
                      "volume_name": volume.name
                      }
            )
        # Save the volume status
        volume.state = out_dict[volume.vol_id]
        volume.save()
    return out_dict
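# A condensed, pure-function sketch of the derivation above for a single
# protected (replicated or dispersed) volume; illustrative names, not the
# source API. bricks_down is the worst per-subvolume count of bricks not
# in "Started" state, and tolerance mirrors subvol_prob above:
# max(replica_count, redundancy_count + 1).
def derive_state(total_bricks, up_bricks, subvol_count, bricks_down,
                 replica_count, redundancy_count, expected_bricks):
    if total_bricks == 0 or total_bricks < expected_bricks:
        return "unknown"        # brick details not synced yet
    if up_bricks == 0:
        return "down"
    if bricks_down == 0:
        return "up"
    tolerance = max(replica_count, redundancy_count + 1)
    if bricks_down == tolerance:
        # losing a whole subvolume: a single-subvolume volume is down,
        # a multi-subvolume one is only partially available
        return "down" if subvol_count == 1 else "(partial)"
    return "(degraded)"


# replica-3 volume, one subvolume, one brick down -> degraded
assert derive_state(3, 2, 1, 1, 3, 0, 3) == "(degraded)"
# same volume with all three bricks of the subvolume down -> down
assert derive_state(3, 1, 1, 3, 3, 0, 3) != "up"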