def save(self, update=True, ttl=None):
    """Save the node context and optionally refresh its status key.

    The write itself is delegated to the parent class.  When ``ttl`` is
    truthy it is remembered on ``self._ttl`` and ``<self.value>/status``
    is refreshed with it.

    :param update: passed through unchanged to the parent ``save``.
    :param ttl: TTL handed to ``etcd_utils.refresh``; a falsy value
        skips the refresh.
    """
    super(NodeContext, self).save(update)
    status_key = self.value + "/status"
    if not ttl:
        return
    self._ttl = ttl
    try:
        etcd_utils.refresh(status_key, ttl)
    except etcd.EtcdKeyNotFound:
        # A missing status key is tolerated here
        pass
def volume_remove_brick_force(self, event):
    """Clean up central-store and monitoring data for removed bricks.

    Sleeps for ``self.sync_interval``, then for every brick named in the
    event: loads the stored brick, deletes its key under
    ``clusters/<integration_id>/Bricks/all``, creates the dashboard and
    graphite delete jobs and drops the ``Bricks`` directory of the
    owning volume.  Finally the ``_sync_now`` key of the cluster is
    written and refreshed with ``self.sync_interval``.

    :param event: dict whose ``event['message']`` carries ``bricks``
        (space separated ``host:/path`` entries) and ``volume``.
    """
    time.sleep(self.sync_interval)
    # The event delivers every brick in one space separated string
    for brick in event['message']['bricks'].split(" "):
        host = brick.split(":/")[0]
        brick_dir = brick.split(":/")[1].replace('/', '_')
        fetched_brick = NS.gluster.objects.Brick(
            fqdn=host,
            brick_dir=brick_dir
        ).load()

        brick_key = "clusters/{0}/Bricks/all/{1}/{2}".format(
            NS.tendrl_context.integration_id, host, brick_dir
        )
        try:
            NS._int.wclient.delete(brick_key, recursive=True)
        except etcd.EtcdKeyNotFound:
            pass

        resource_name = "%s|%s" % (event['message']['volume'], brick)
        job_id = monitoring_utils.update_dashboard(
            resource_name,
            RESOURCE_TYPE_BRICK,
            NS.tendrl_context.integration_id,
            "delete"
        )
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Update dashboard job %s created" % job_id}
        )

        job_id = monitoring_utils.delete_resource_from_graphite(
            resource_name,
            RESOURCE_TYPE_BRICK,
            NS.tendrl_context.integration_id,
            "delete"
        )
        logger.log(
            "debug",
            NS.publisher_id,
            {
                "message": "Delete resource from graphite job %s created"
                           % job_id
            }
        )

        # Drop all brick information kept under the volume: the
        # subvolume layout may have changed, so the next sync is left
        # to write the brick info again
        volume_brick_path = "clusters/{0}/Volumes/{1}/Bricks".format(
            NS.tendrl_context.integration_id,
            fetched_brick.vol_id,
        )
        try:
            NS._int.wclient.delete(volume_brick_path, recursive=True)
        except etcd.EtcdKeyNotFound:
            pass

    sync_now_key = 'clusters/%s/_sync_now' % \
        NS.tendrl_context.integration_id
    etcd_utils.write(sync_now_key, 'true')
    etcd_utils.refresh(sync_now_key, self.sync_interval)
def save(self, update=True, ttl=None):
    """Save the brick, invalidating the volume hash when it changed.

    If ``hash_compare_with_central_store()`` reports a difference, the
    hash of the ``Volume`` object built from ``self.vol_id`` is
    invalidated before the brick is saved through the parent class.

    :param update: passed through unchanged to the parent ``save``.
    :param ttl: when truthy, ``<self.value>/status`` is refreshed with
        this TTL.
    """
    unchanged = self.hash_compare_with_central_store()
    if not unchanged:
        parent_volume = NS.gluster.objects.Volume(vol_id=self.vol_id)
        parent_volume.invalidate_hash()

    super(Brick, self).save(update)

    status_key = self.value + "/status"
    if ttl:
        etcd_utils.refresh(status_key, ttl)
def push_operation(self):
    """Append the message to its job key and return a log line.

    The JSON form of ``self.message`` is appended under
    ``/messages/jobs/<job_id>`` and that key is refreshed with the
    ``message_retention_time`` taken from ``NS.config.data``.

    :return: ``"<job_id>:<payload message>"``.
    """
    job_key = "/messages/jobs/%s" % self.message.job_id
    serialized = Message.to_json(self.message)
    etcd_utils.write(job_key, serialized, append=True)

    retention = NS.config.data['message_retention_time']
    etcd_utils.refresh(job_key, ttl=retention)

    return "%s:%s" % (
        self.message.job_id,
        self.message.payload["message"]
    )
def save(self, update=True, ttl=None):
    """Save the brick, invalidating the volume hash when it changed.

    If ``hash_compare_with_central_store()`` reports a difference, the
    ``GlusterVolume`` identified by the current integration id and
    ``self.vol_id`` is loaded and its hash invalidated before the brick
    is saved through the parent class.

    :param update: passed through unchanged to the parent ``save``.
    :param ttl: when truthy, ``<self.value>/status`` is refreshed with
        this TTL.
    """
    unchanged = self.hash_compare_with_central_store()
    if not unchanged:
        parent_volume = NS.tendrl.objects.GlusterVolume(
            NS.tendrl_context.integration_id,
            vol_id=self.vol_id
        ).load()
        parent_volume.invalidate_hash()

    super(Brick, self).save(update)

    status_key = self.value + "/status"
    if ttl:
        etcd_utils.refresh(status_key, ttl)
def save(self, update=True, ttl=None):
    """Write this object to the central store.

    Attributes listed in the definition's ``watch_attrs`` are written
    one key at a time; values typed ``json`` or ``list`` are JSON
    encoded first, and attributes named in ``self._attrs_with_ttl`` are
    written with ``self._ttl`` when that is set.  When the object hash
    differs from the stored one, the ``data``, ``updated_at`` and
    ``hash`` keys under ``self.value`` are rewritten and, if ``ttl`` is
    given, ``self.value`` is refreshed with it.

    :param update: accepted for interface compatibility; not used here.
    :param ttl: optional TTL, forwarded to the hash comparison and used
        for the refresh after a changed object was written.
    """
    hash_key_changed = True
    if "Message" not in self.__class__.__name__:
        # If local object.hash is equal to
        # central_store object.hash, nothing under /data changes
        if self.hash_compare_with_central_store(ttl=ttl):
            # No change in hashkey
            hash_key_changed = False

    rendered_obj = self.render()
    watchables = self._defs.get("watch_attrs", [])
    if self.__class__.__name__ in ['Config', 'Definition'] or \
            len(watchables) > 0:
        for item in rendered_obj:
            if item['name'] in watchables:
                _type = self._defs.get("attrs", {}).get(
                    item['name'], {}
                ).get("type")
                if _type and _type.lower() in ['json', 'list'] and \
                        item['value']:
                    try:
                        item['value'] = json.dumps(item['value'])
                    except ValueError:
                        # Instances have no __name__ attribute; use the
                        # class name so the handler itself cannot raise
                        _msg = "Error save() attr %s for object %s" % \
                               (item['name'], self.__class__.__name__)
                        logger.log(
                            "debug",
                            NS.publisher_id,
                            {"message": _msg}
                        )
                if self._ttl and item['name'] in self._attrs_with_ttl:
                    etcd_utils.write(
                        item['key'],
                        item['value'],
                        quorum=True,
                        ttl=self._ttl
                    )
                else:
                    etcd_utils.write(
                        item['key'],
                        item['value'],
                        quorum=True
                    )

    if hash_key_changed:
        data_key = self.value + '/data'
        etcd_utils.write(data_key, self.json)
        updated_at_key = self.value + '/updated_at'
        hash_key = self.value + '/hash'
        etcd_utils.write(updated_at_key, str(time_utils.now()))
        if hasattr(self, 'hash'):
            etcd_utils.write(hash_key, self.hash)

        if ttl:
            etcd_utils.refresh(self.value, ttl)

    self.watch_attrs()
def save(self, update=True, ttl=None):
    """Write this object to the central store.

    Attributes listed in the definition's ``watch_attrs`` are written
    one key at a time; values typed ``json`` or ``list`` are JSON
    encoded first.  When the object hash differs from the stored one,
    the ``data``, ``updated_at`` and ``hash`` keys under ``self.value``
    are rewritten and, if ``ttl`` is given, ``self.value`` is refreshed
    with it.

    :param update: accepted for interface compatibility; not used here.
    :param ttl: optional TTL, forwarded to the hash comparison and used
        for the refresh after a changed object was written.
    """
    hash_key_changed = True
    if "Message" not in self.__class__.__name__:
        # If local object.hash is equal to
        # central_store object.hash, nothing under /data changes
        if self.hash_compare_with_central_store(ttl=ttl):
            # No change in hashkey
            hash_key_changed = False

    rendered_obj = self.render()
    watchables = self._defs.get("watch_attrs", [])
    if self.__class__.__name__ in ['Config', 'Definition'] or \
            len(watchables) > 0:
        for item in rendered_obj:
            if item['name'] in watchables:
                _type = self._defs.get("attrs", {}).get(
                    item['name'],
                    {}
                ).get("type")
                if _type and _type.lower() in ['json', 'list'] and \
                        item['value']:
                    try:
                        item['value'] = json.dumps(item['value'])
                    except ValueError:
                        # Instances have no __name__ attribute; use the
                        # class name so the handler itself cannot raise
                        _msg = "Error save() attr %s for object %s" % \
                               (item['name'], self.__class__.__name__)
                        logger.log(
                            "debug",
                            NS.publisher_id,
                            {"message": _msg}
                        )
                etcd_utils.write(item['key'], item['value'], quorum=True)

    if hash_key_changed:
        data_key = self.value + '/data'
        etcd_utils.write(data_key, self.json)
        updated_at_key = self.value + '/updated_at'
        hash_key = self.value + '/hash'
        etcd_utils.write(updated_at_key, str(time_utils.now()))
        if hasattr(self, 'hash'):
            etcd_utils.write(hash_key, self.hash)

        if ttl:
            etcd_utils.refresh(self.value, ttl)

    self.watch_attrs()
def hash_compare_with_central_store(self, ttl=None):
    """Tell whether the in-memory hash equals the stored one.

    The hash is recomputed with ``self._hash()`` and kept on
    ``self.hash``, then compared with the value read from
    ``/<self.value>/hash``.

    :param ttl: when truthy and the hashes match, ``self.value`` is
        refreshed with this TTL.
    :return: ``True`` if the hashes are equal; ``False`` if they
        differ, the hash key is missing, or a ``TypeError`` is raised
        along the way.
    """
    try:
        # Hash of the object as it currently is in memory
        self.hash = self._hash()
        stored_hash_key = "/{0}/hash".format(self.value)
        try:
            stored_hash = etcd_utils.read(stored_hash_key).value
        except etcd.EtcdKeyNotFound:
            return False

        hashes_match = self.hash == stored_hash
        if not hashes_match:
            return False

        # Stored and current object are the same, so the caller need
        # not write it again; only keep the key alive
        if ttl:
            etcd_utils.refresh(self.value, ttl)
        return True
    except TypeError:
        # no hash for this object, save the current hash as is
        return False
def hash_compare_with_central_store(self, ttl=None):
    """Tell whether the in-memory hash equals the stored one.

    Calls ``self.render()`` and ``self._hash()``, then compares
    ``self.hash`` with the value read from ``/<self.value>/hash``.  On
    an etcd error other than exactly ``EtcdKeyNotFound`` the client is
    reconnected and the read is tried once more.

    :param ttl: when truthy and the hashes match, ``self.value`` is
        refreshed with this TTL.
    :return: ``True`` if the hashes are equal; ``False`` if they differ
        or a ``TypeError`` is raised along the way.
    """
    self.render()
    try:
        # Hash of the object as it currently is in memory
        self._hash()
        stored_hash_key = "/{0}/hash".format(self.value)
        stored_hash = None
        try:
            stored_hash = NS._int.client.read(stored_hash_key).value
        except (etcd.EtcdConnectionFailed, etcd.EtcdException) as ex:
            if type(ex) != etcd.EtcdKeyNotFound:
                NS._int.reconnect()
                stored_hash = NS._int.client.read(stored_hash_key).value

        hashes_match = self.hash == stored_hash
        if not hashes_match:
            return False

        # Stored and current object are the same, so the caller need
        # not write it again; only keep the key alive
        if ttl:
            etcd_utils.refresh(self.value, ttl)
        return True
    except TypeError:
        # no hash for this object, save the current hash as is
        return False
def test_refresh():
    """Exercise ``etcd_utils.refresh`` against the fixture client.

    Covers three cases: a patched ``Client.refresh`` is actually
    invoked, a connection failure propagates as
    ``etcd.EtcdConnectionFailed``, and a missing key propagates as
    ``etcd.EtcdKeyNotFound``.
    """
    setattr(__builtin__, "NS", maps.NamedDict())
    setattr(NS, "_int", maps.NamedDict())
    NS._int.wclient = importlib.import_module("tendrl.commons"
                                              ".tests.fixtures."
                                              "client").Client()
    NS._int.wreconnect = type("Dummy", (object, ), {})
    with patch.object(Client, "refresh") as mock_refresh:
        etcd_utils.refresh("test_value", 1)
        # Check the recorded call state; asserting on the
        # ``assert_called`` attribute itself never verified a call
        assert mock_refresh.called
    with patch.object(Client, "refresh",
                      raise_etcdconnectionfailed):
        with pytest.raises(etcd.EtcdConnectionFailed):
            etcd_utils.refresh("test_value", 1)
    with patch.object(Client, "refresh",
                      raise_etcdkeynotfound):
        with pytest.raises(etcd.EtcdKeyNotFound):
            etcd_utils.refresh("test_value", 1)
def test_refresh():
    """Exercise ``etcd_utils.refresh`` against the fixture client.

    Covers three cases: a patched ``Client.refresh`` is actually
    invoked, a connection failure propagates as
    ``etcd.EtcdConnectionFailed``, and a missing key propagates as
    ``etcd.EtcdKeyNotFound``.
    """
    setattr(__builtin__, "NS", maps.NamedDict())
    setattr(NS, "_int", maps.NamedDict())
    NS._int.wclient = importlib.import_module("tendrl.commons"
                                              ".tests.fixtures."
                                              "client").Client()
    NS._int.wreconnect = type("Dummy", (object,), {})
    with patch.object(Client, "refresh") as mock_refresh:
        etcd_utils.refresh("test_value", 1)
        # Check the recorded call state; asserting on the
        # ``assert_called`` attribute itself never verified a call
        assert mock_refresh.called
    with patch.object(Client, "refresh",
                      raise_etcdconnectionfailed):
        with pytest.raises(etcd.EtcdConnectionFailed):
            etcd_utils.refresh("test_value", 1)
    with patch.object(Client, "refresh",
                      raise_etcdkeynotfound):
        with pytest.raises(etcd.EtcdKeyNotFound):
            etcd_utils.refresh("test_value", 1)
def save(self, update=True, ttl=None):
    """Write every rendered attribute of this object to etcd.

    Non-message objects are skipped entirely when
    ``hash_compare_with_central_store`` reports no change.  With
    ``update`` set, attributes that are ``None`` locally take the
    truthy value found on the stored object before anything is
    written.  Values typed ``json`` or ``list`` are JSON encoded.  A
    failed write is retried once after reconnecting the write client.

    :param update: merge missing attribute values from the stored
        object before writing.
    :param ttl: when truthy, ``self.value`` is refreshed with this TTL
        after the writes (or by the hash comparison if nothing changed).
    """
    self.render()
    if "Message" not in self.__class__.__name__:
        # If local object.hash is equal to
        # central_store object.hash, return
        if self.hash_compare_with_central_store(ttl=ttl):
            return

    if update:
        current_obj = self.load()
        for attr, val in vars(self).iteritems():
            if isinstance(val, (types.FunctionType,
                                types.BuiltinFunctionType,
                                types.MethodType,
                                types.BuiltinMethodType,
                                types.UnboundMethodType)) or \
                    attr.startswith("_") or attr in ['value', 'list']:
                continue

            if val is None and hasattr(current_obj, attr):
                # if self.attr is None, use attr value from central
                # store (i.e. current_obj.attr)
                if getattr(current_obj, attr):
                    setattr(self, attr, getattr(current_obj, attr))

    self.updated_at = str(time_utils.now())
    for item in self.render():
        # Note: Log messages in this file have try-except blocks to run
        # in the condition when the node_agent has not been started and
        # name spaces are being created.
        try:
            logger.log(
                "debug",
                NS.publisher_id,
                {
                    "message": "Writing %s to %s" %
                               (item['key'], item['value'])
                }
            )
        except KeyError:
            sys.stdout.write("Writing %s to %s \n" %
                             (item['key'], item['value']))

        # convert list, dict (json) to python based on definitions
        _type = self._defs.get("attrs", {}).get(
            item['name'], {}
        ).get("type")
        if _type:
            if _type.lower() in ['json', 'list']:
                if item['value']:
                    try:
                        item['value'] = json.dumps(item['value'])
                    except ValueError as ex:
                        # Instances have no __name__ attribute; use the
                        # class name so the handler itself cannot raise
                        _msg = "Error save() attr %s for object %s" % \
                               (item['name'], self.__class__.__name__)
                        Event(
                            ExceptionMessage(
                                priority="debug",
                                publisher=NS.publisher_id,
                                payload={
                                    "message": _msg,
                                    "exception": ex
                                }
                            )
                        )
        try:
            NS._int.wclient.write(item['key'], item['value'], quorum=True)
        except (etcd.EtcdConnectionFailed, etcd.EtcdException):
            # Reconnect the write client and try exactly once more
            NS._int.wreconnect()
            NS._int.wclient.write(item['key'], item['value'], quorum=True)

    if ttl:
        etcd_utils.refresh(self.value, ttl)

    self.watch_attrs()
def save(self, update=True, ttl=None):
    """Write every rendered attribute of this object to etcd.

    For non-message objects the in-memory hash is compared with the
    one stored under ``/<self.value>/hash``; when they are equal
    nothing is written (only the TTL is refreshed, if given).  With
    ``update`` set, attributes that are ``None`` locally take the
    truthy value found on the stored object before anything is
    written.  Values typed ``json`` or ``list`` are JSON encoded.  A
    failed write is retried once after reconnecting the write client.

    :param update: merge missing attribute values from the stored
        object before writing.
    :param ttl: when truthy, ``self.value`` is refreshed with this TTL.
    """
    self.render()
    if "Message" not in self.__class__.__name__:
        try:
            # Generate current in memory object hash
            self.hash = self._hash()
            _hash_key = "/{0}/hash".format(self.value)
            _stored_hash = None
            try:
                _stored_hash = NS._int.client.read(_hash_key).value
            except (etcd.EtcdConnectionFailed, etcd.EtcdException) as ex:
                if type(ex) != etcd.EtcdKeyNotFound:
                    NS._int.reconnect()
                    _stored_hash = NS._int.client.read(_hash_key).value
            if self.hash == _stored_hash:
                # No changes in stored object and current object,
                # dont save current object to central store
                if ttl:
                    etcd_utils.refresh(self.value, ttl)
                return
        except TypeError:
            # no hash for this object, save the current hash as is
            pass

    if update:
        current_obj = self.load()
        for attr, val in vars(self).iteritems():
            if isinstance(val, (types.FunctionType,
                                types.BuiltinFunctionType,
                                types.MethodType,
                                types.BuiltinMethodType,
                                types.UnboundMethodType)) or \
                    attr.startswith("_") or attr in ['value', 'list']:
                continue

            if val is None and hasattr(current_obj, attr):
                # if self.attr is None, use attr value from central
                # store (i.e. current_obj.attr)
                if getattr(current_obj, attr):
                    setattr(self, attr, getattr(current_obj, attr))

    self.updated_at = str(time_utils.now())
    for item in self.render():
        # Note: Log messages in this file have try-except blocks to run
        # in the condition when the node_agent has not been started and
        # name spaces are being created.
        try:
            Event(
                Message(
                    priority="debug",
                    publisher=NS.publisher_id,
                    payload={
                        "message": "Writing %s to %s" %
                                   (item['key'], item['value'])
                    }
                )
            )
        except KeyError:
            sys.stdout.write("Writing %s to %s" %
                             (item['key'], item['value']))

        # convert list, dict (json) to python based on definitions
        _type = self._defs.get("attrs", {}).get(
            item['name'], {}
        ).get("type")
        if _type:
            if _type.lower() in ['json', 'list']:
                if item['value']:
                    try:
                        item['value'] = json.dumps(item['value'])
                    except ValueError as ex:
                        # Instances have no __name__ attribute; use the
                        # class name so the handler itself cannot raise
                        _msg = "Error save() attr %s for object %s" % \
                               (item['name'], self.__class__.__name__)
                        Event(
                            ExceptionMessage(
                                priority="debug",
                                publisher=NS.publisher_id,
                                payload={
                                    "message": _msg,
                                    "exception": ex
                                }
                            )
                        )
        try:
            NS._int.wclient.write(item['key'], item['value'], quorum=True)
        except (etcd.EtcdConnectionFailed, etcd.EtcdException):
            # Reconnect the write client and try exactly once more
            NS._int.wreconnect()
            NS._int.wclient.write(item['key'], item['value'], quorum=True)

    if ttl:
        etcd_utils.refresh(self.value, ttl)
def volume_remove_brick_force(self, event):
    """Clean up central-store and monitoring data for removed bricks.

    Sleeps for ``self.sync_interval``, then for every brick named in the
    event: resolves the brick host to the fqdn stored for that node,
    loads the stored brick, deletes its key under
    ``clusters/<integration_id>/Bricks/all``, creates the dashboard and
    graphite delete jobs and drops the ``Bricks`` directory of the
    owning volume.  Afterwards the ``_sync_now`` key of the cluster is
    written and refreshed with ``self.sync_interval``.  An
    ``etcd.EtcdKeyNotFound`` raised anywhere in that sequence is logged
    at debug level and ends the handling.

    :param event: dict whose ``event['message']`` carries ``bricks``
        (space separated ``host:/path`` entries) and ``volume``.
    """
    time.sleep(self.sync_interval)
    # The event delivers every brick in one space separated string
    bricks = event['message']['bricks'].split(" ")
    try:
        for reported_brick in bricks:
            # find fqdn using ip
            host_ip = socket.gethostbyname(reported_brick.split(":/")[0])
            node_id = etcd_utils.read("indexes/ip/%s" % host_ip).value
            node = NS.tendrl.objects.ClusterNodeContext(
                node_id=node_id
            ).load()
            brick = node.fqdn + ":" + reported_brick.split(":")[-1]

            brick_host = brick.split(":/")[0]
            brick_dir = brick.split(":/")[1].replace('/', '_')
            fetched_brick = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                fqdn=brick_host,
                brick_dir=brick_dir
            ).load()

            # delete brick
            brick_key = "clusters/{0}/Bricks/all/{1}/{2}".format(
                NS.tendrl_context.integration_id,
                brick_host,
                brick_dir
            )
            etcd_utils.delete(brick_key, recursive=True)

            # delete alert dashboard
            resource_name = "%s|%s" % (event['message']['volume'], brick)
            job_id = monitoring_utils.update_dashboard(
                resource_name,
                RESOURCE_TYPE_BRICK,
                NS.tendrl_context.integration_id,
                "delete"
            )
            logger.log(
                "debug",
                NS.publisher_id,
                {"message": "Update dashboard job %s created" % job_id}
            )

            # delete brick details from graphite
            job_id = monitoring_utils.delete_resource_from_graphite(
                resource_name,
                RESOURCE_TYPE_BRICK,
                NS.tendrl_context.integration_id,
                "delete"
            )
            logger.log(
                "debug",
                NS.publisher_id,
                {
                    "message": "Delete resource from graphite job %s "
                               "created" % job_id
                }
            )

            # Drop all brick information kept under the volume: the
            # subvolume layout may have changed, so the next sync is
            # left to write the brick info again
            volume_brick_path = "clusters/{0}/Volumes/{1}/Bricks".format(
                NS.tendrl_context.integration_id,
                fetched_brick.vol_id,
            )
            etcd_utils.delete(volume_brick_path, recursive=True)

        sync_now_key = 'clusters/%s/_sync_now' % \
            NS.tendrl_context.integration_id
        etcd_utils.write(sync_now_key, 'true')
        etcd_utils.refresh(sync_now_key, self.sync_interval)
    except etcd.EtcdKeyNotFound:
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Unable to delete bricks %s" % bricks}
        )
def volume_remove_brick_force(self, event):
    """React to a forced remove-brick event from gluster.

    After waiting ``self.sync_interval`` seconds each brick listed in
    the event is mapped to the fqdn stored for its node, its stored
    brick object is loaded, its entry under
    ``clusters/<integration_id>/Bricks/all`` is deleted, dashboard and
    graphite delete jobs are created, and the ``Bricks`` directory of
    the owning volume is removed.  The cluster's ``_sync_now`` key is
    then written and refreshed with ``self.sync_interval``.  If
    ``etcd.EtcdKeyNotFound`` is raised at any point, a debug message is
    logged instead.

    :param event: dict whose ``event['message']`` carries ``bricks``
        (space separated ``host:/path`` entries) and ``volume``.
    """
    time.sleep(self.sync_interval)
    # Bricks arrive as a single string, separated by spaces
    bricks = event['message']['bricks'].split(" ")
    try:
        for event_brick in bricks:
            # find fqdn using ip
            ip = socket.gethostbyname(event_brick.split(":/")[0])
            node_id = etcd_utils.read("indexes/ip/%s" % ip).value
            fqdn = NS.tendrl.objects.ClusterNodeContext(
                node_id=node_id
            ).load().fqdn
            brick = fqdn + ":" + event_brick.split(":")[-1]

            fqdn_part = brick.split(":/")[0]
            dir_part = brick.split(":/")[1].replace('/', '_')
            stored_brick = NS.tendrl.objects.GlusterBrick(
                NS.tendrl_context.integration_id,
                fqdn=fqdn_part,
                brick_dir=dir_part
            ).load()

            # delete brick
            etcd_utils.delete(
                "clusters/{0}/Bricks/all/{1}/{2}".format(
                    NS.tendrl_context.integration_id,
                    fqdn_part,
                    dir_part
                ),
                recursive=True,
            )

            # delete alert dashboard
            dashboard_job = monitoring_utils.update_dashboard(
                "%s|%s" % (event['message']['volume'], brick),
                RESOURCE_TYPE_BRICK,
                NS.tendrl_context.integration_id,
                "delete"
            )
            dashboard_msg = "Update dashboard job %s created" % \
                dashboard_job
            logger.log("debug", NS.publisher_id, {"message": dashboard_msg})

            # delete brick details from graphite
            graphite_job = monitoring_utils.delete_resource_from_graphite(
                "%s|%s" % (event['message']['volume'], brick),
                RESOURCE_TYPE_BRICK,
                NS.tendrl_context.integration_id,
                "delete"
            )
            graphite_msg = "Delete resource from graphite job %s " \
                "created" % graphite_job
            logger.log("debug", NS.publisher_id, {"message": graphite_msg})

            # The subvolume layout may have changed, so every brick
            # entry under the volume is removed and the next sync is
            # left to write the brick info again
            etcd_utils.delete(
                "clusters/{0}/Volumes/{1}/Bricks".format(
                    NS.tendrl_context.integration_id,
                    stored_brick.vol_id,
                ),
                recursive=True
            )

        trigger_key = 'clusters/%s/_sync_now' % \
            NS.tendrl_context.integration_id
        etcd_utils.write(trigger_key, 'true')
        etcd_utils.refresh(trigger_key, self.sync_interval)
    except etcd.EtcdKeyNotFound:
        failure_msg = "Unable to delete bricks %s" % bricks
        logger.log("debug", NS.publisher_id, {"message": failure_msg})
def run(self):
    """Run the gluster state sync loop until ``self._complete`` is set.

    Before looping, the brick directory object is saved and the cluster
    network is filled in from the first node network when it is empty.

    Every iteration reloads the node and tendrl contexts, dumps the
    glusterd state with ``gluster get-state``, and syncs peers, volumes
    and volume options into the central store.  On the node tagged
    ``provisioner/<integration_id>`` it also syncs the cluster wide
    details (status, utilization, connections, geo-replication,
    rebalance, snapshots, alert counts, volume profiling).  Any
    exception raised by one iteration is published as an
    ``ExceptionMessage`` and the loop carries on.

    The loop sleeps ``_sleep`` seconds between iterations unless the
    cluster's ``_sync_now`` key exists, in which case the next
    iteration starts immediately.
    """
    # VOLUME_TTL is read and reassigned below; the declaration has to
    # precede every use of the name inside this function.
    global VOLUME_TTL

    logger.log(
        "info",
        NS.publisher_id,
        {"message": "%s running" % self.__class__.__name__}
    )

    gluster_brick_dir = NS.gluster.objects.GlusterBrickDir()
    gluster_brick_dir.save()

    cluster = NS.tendrl.objects.Cluster(
        integration_id=NS.tendrl_context.integration_id
    ).load()
    if cluster.cluster_network in [None, ""]:
        try:
            node_networks = NS.tendrl.objects.NodeNetwork().load_all()
            cluster.cluster_network = node_networks[0].subnet
            cluster.save()
        except etcd.EtcdKeyNotFound:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": "Failed to sync cluster network details"}
            )
    _sleep = 0
    while not self._complete.is_set():
        # To detect out of band deletes
        # refresh gluster object inventory at config['sync_interval']
        SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 100
        NS.node_context = NS.node_context.load()
        NS.tendrl_context = NS.tendrl_context.load()
        # Ramp the pause up by one second per iteration, then settle on
        # the configured sync interval
        if _sleep > 5:
            _sleep = int(NS.config.data.get("sync_interval", 10))
        else:
            _sleep += 1

        try:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id
            ).load()
            if (_cluster.status == "importing" and
                    _cluster.current_job['status'] == 'failed') or \
                    _cluster.status == "unmanaging" or \
                    _cluster.status == "set_volume_profiling":
                # Pause before re-checking; continuing straight away
                # would poll the central store in a tight loop
                time.sleep(_sleep)
                continue

            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=NS.node_context.node_id
            ).load()
            _cnc.is_managed = "yes"
            _cnc.save()
            subprocess.call(
                [
                    'gluster',
                    'get-state',
                    'glusterd',
                    'odir',
                    '/var/run',
                    'file',
                    'glusterd-state',
                    'detail'
                ]
            )
            raw_data = ini2json.ini_to_dict(
                '/var/run/glusterd-state'
            )
            subprocess.call(['rm', '-rf', '/var/run/glusterd-state'])
            subprocess.call(
                [
                    'gluster',
                    'get-state',
                    'glusterd',
                    'odir',
                    '/var/run',
                    'file',
                    'glusterd-state-vol-opts',
                    'volumeoptions'
                ]
            )
            raw_data_options = ini2json.ini_to_dict(
                '/var/run/glusterd-state-vol-opts'
            )
            subprocess.call(
                [
                    'rm',
                    '-rf',
                    '/var/run/glusterd-state-vol-opts'
                ]
            )
            sync_object = NS.gluster.objects.\
                SyncObject(data=json.dumps(raw_data))
            sync_object.save()

            if "Peers" in raw_data:
                index = 1
                peers = raw_data["Peers"]
                disconnected_hosts = []
                # Peers are keyed peer1.*, peer2.*, ...; the KeyError
                # on the first missing index ends the walk
                while True:
                    try:
                        peer = NS.tendrl.\
                            objects.GlusterPeer(
                                peer_uuid=peers['peer%s.uuid' % index],
                                hostname=peers[
                                    'peer%s.primary_hostname' % index
                                ],
                                state=peers['peer%s.state' % index],
                                connected=peers['peer%s.connected' % index]
                            )
                        try:
                            stored_peer_status = None
                            # find peer detail using hostname
                            ip = socket.gethostbyname(
                                peers['peer%s.primary_hostname' % index]
                            )
                            node_id = etcd_utils.read(
                                "/indexes/ip/%s" % ip
                            ).value
                            stored_peer = NS.tendrl.objects.GlusterPeer(
                                peer_uuid=peers['peer%s.uuid' % index],
                                node_id=node_id
                            ).load()
                            stored_peer_status = stored_peer.connected
                            current_status = peers[
                                'peer%s.connected' % index
                            ]
                            if stored_peer_status and \
                                    current_status != stored_peer_status:
                                msg = (
                                    "Peer %s in cluster %s "
                                    "is %s"
                                ) % (
                                    peers[
                                        'peer%s.primary_hostname' %
                                        index
                                    ],
                                    _cluster.short_name,
                                    current_status
                                )
                                instance = "peer_%s" % peers[
                                    'peer%s.primary_hostname' % index
                                ]
                                event_utils.emit_event(
                                    "peer_status",
                                    current_status,
                                    msg,
                                    instance,
                                    'WARNING' if current_status !=
                                    'Connected'
                                    else 'INFO'
                                )
                                # save current status in actual peer
                                # directory also
                                stored_peer.connected = current_status
                                stored_peer.save()
                                # Disconnected host name to
                                # raise brick alert
                                if current_status.lower() == \
                                        "disconnected":
                                    disconnected_hosts.append(
                                        peers[
                                            'peer%s.primary_hostname' %
                                            index
                                        ]
                                    )
                        except etcd.EtcdKeyNotFound:
                            pass
                        SYNC_TTL += 5
                        peer.save(ttl=SYNC_TTL)
                        index += 1
                    except KeyError:
                        break
                # Raise an alert for bricks when peer disconnected
                # or node goes down
                for disconnected_host in disconnected_hosts:
                    brick_status_alert(
                        disconnected_host
                    )
            if "Volumes" in raw_data:
                index = 1
                volumes = raw_data['Volumes']
                # instantiating blivet class, this will be used for
                # getting brick_device_details
                b = blivet.Blivet()

                # reset blivet during every sync to get latest information
                # about storage devices in the machine
                b.reset()
                devicetree = b.devicetree
                total_brick_count = 0
                # Same walk as for the peers: a KeyError marks the end
                # of the volume list
                while True:
                    try:
                        b_count = sync_volumes(
                            volumes,
                            index,
                            raw_data_options.get('Volume Options'),
                            SYNC_TTL + VOLUME_TTL,
                            _cluster.short_name,
                            devicetree
                        )
                        index += 1
                        SYNC_TTL += 1
                        total_brick_count += b_count - 1
                    except KeyError:
                        # from second sync volume ttl is
                        # SYNC_TTL + (no.volumes) * 20 +
                        # (no.of.bricks) * 10 + 160
                        if index > 1:
                            volume_count = index - 1
                            # When all nodes are down we are updating all
                            # volumes are down, node status TTL is 160,
                            # So make sure volumes are present in etcd
                            # while raising volume down alert
                            VOLUME_TTL = (volume_count * 20) + (
                                total_brick_count * 10) + 160
                        break
                # populate the volume specific options
                reg_ex = re.compile("^volume[0-9]+.options+")
                options = {}
                for key in volumes.keys():
                    if reg_ex.match(key):
                        options[key] = volumes[key]
                for key in options.keys():
                    volname = key.split('.')[0]
                    vol_id = volumes['%s.id' % volname]
                    dict1 = {}
                    for k, v in options.items():
                        if k.startswith('%s.options' % volname):
                            dict1['.'.join(k.split(".")[2:])] = v
                            options.pop(k, None)
                    volume = NS.tendrl.objects.GlusterVolume(
                        NS.tendrl_context.integration_id,
                        vol_id=vol_id
                    ).load()
                    if volume.options is not None:
                        dest = dict(volume.options)
                        dest.update(dict1)
                        volume.options = dest
                        volume.save()

            # Sync cluster global details
            if "provisioner/%s" % NS.tendrl_context.integration_id \
                    in NS.node_context.tags:
                all_volumes = NS.tendrl.objects.GlusterVolume(
                    NS.tendrl_context.integration_id
                ).load_all() or []
                volumes = []
                for volume in all_volumes:
                    if not str(volume.deleted).lower() == "true" and \
                            volume.current_job.get('status', '') \
                            in ['', 'finished', 'failed'] and \
                            volume.vol_id not in [None, ''] and \
                            volume.name not in [None, '']:
                        # only for first sync refresh volume TTL
                        # It will increase TTL based on no.of volumes
                        if _cnc.first_sync_done in [None, "no", ""]:
                            etcd_utils.refresh(
                                volume.value,
                                SYNC_TTL + VOLUME_TTL
                            )
                        volumes.append(volume)
                cluster_status.sync_cluster_status(
                    volumes, SYNC_TTL + VOLUME_TTL
                )
                utilization.sync_utilization_details(volumes)
                client_connections.sync_volume_connections(volumes)
                georep_details.aggregate_session_status()
                try:
                    evt.process_events()
                except etcd.EtcdKeyNotFound:
                    pass
                rebalance_status.sync_volume_rebalance_status(volumes)
                rebalance_status.sync_volume_rebalance_estimated_time(
                    volumes
                )
                snapshots.sync_volume_snapshots(
                    raw_data['Volumes'],
                    int(NS.config.data.get(
                        "sync_interval", 10
                    )) + len(volumes) * 4
                )
                # update alert count
                update_cluster_alert_count()
            # check and enable volume profiling
            if "provisioner/%s" % NS.tendrl_context.integration_id in \
                    NS.node_context.tags:
                self._enable_disable_volume_profiling()

            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id
            ).load()
            if _cluster.exists():
                _cluster = _cluster.load()
                _cluster.last_sync = str(tendrl_now())
                # Mark the first sync done flag
                _cnc = NS.tendrl.objects.ClusterNodeContext(
                    node_id=NS.node_context.node_id
                ).load()
                if _cnc.first_sync_done in [None, "no"]:
                    _cnc.first_sync_done = "yes"
                    _cnc.save()
                if _cluster.current_job.get(
                    'status', ''
                ) in ['', 'finished', 'failed'] and \
                        _cluster.status in [None, ""]:
                    _cluster.save()

        except Exception as ex:
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": "gluster sds state sync error",
                             "exception": ex
                             }
                )
            )
        try:
            # An existing _sync_now key skips the pause below
            etcd_utils.read(
                '/clusters/%s/_sync_now' %
                NS.tendrl_context.integration_id
            )
            continue
        except etcd.EtcdKeyNotFound:
            pass

        time.sleep(_sleep)

    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "%s complete" % self.__class__.__name__}
    )
def save(self, update=True, ttl=None):
    """Invalidate the hash, save, and optionally refresh the status key.

    :param update: passed through unchanged to the parent ``save``.
    :param ttl: when truthy, ``<self.value>/status`` is refreshed with
        this TTL.
    """
    self.invalidate_hash()
    super(GlobalDetails, self).save(update)
    status_key = self.value + "/status"
    if not ttl:
        return
    etcd_utils.refresh(status_key, ttl)
def save(self, update=True, ttl=None):
    """Save through the parent class and optionally refresh the status key.

    :param update: passed through unchanged to the parent ``save``.
    :param ttl: when truthy, ``<self.value>/status`` is refreshed with
        this TTL.
    """
    super(ClusterNodeContext, self).save(update)
    status_key = self.value + "/status"
    if not ttl:
        return
    etcd_utils.refresh(status_key, ttl)
def run(self):
    """Run the gluster state sync loop until ``self._complete`` is set.

    Before looping, the brick directory object is saved and the cluster
    network is filled in from the first node network when it is empty.

    Every iteration reloads the node and tendrl contexts, dumps the
    glusterd state with ``gluster get-state``, and syncs peers, volumes
    and volume options into the central store.  On the node tagged
    ``provisioner/<integration_id>`` it also syncs the cluster wide
    details (status, utilization, connections, geo-replication,
    rebalance, snapshots, alert counts, volume profiling).  Any
    exception raised by one iteration is published as an
    ``ExceptionMessage`` and the loop carries on.

    The loop sleeps ``_sleep`` seconds between iterations unless the
    cluster's ``_sync_now`` key exists, in which case the next
    iteration starts immediately.
    """
    # VOLUME_TTL is read and reassigned below; the declaration has to
    # precede every use of the name inside this function.
    global VOLUME_TTL

    logger.log(
        "info",
        NS.publisher_id,
        {"message": "%s running" % self.__class__.__name__}
    )

    gluster_brick_dir = NS.gluster.objects.GlusterBrickDir()
    gluster_brick_dir.save()

    cluster = NS.tendrl.objects.Cluster(
        integration_id=NS.tendrl_context.integration_id
    ).load()
    if cluster.cluster_network in [None, ""]:
        try:
            node_networks = NS.tendrl.objects.NodeNetwork().load_all()
            cluster.cluster_network = node_networks[0].subnet
            cluster.save()
        except etcd.EtcdKeyNotFound:
            logger.log(
                "error",
                NS.publisher_id,
                {"message": "Failed to sync cluster network details"}
            )
    _sleep = 0
    while not self._complete.is_set():
        # To detect out of band deletes
        # refresh gluster object inventory at config['sync_interval']
        SYNC_TTL = int(NS.config.data.get("sync_interval", 10)) + 100
        NS.node_context = NS.node_context.load()
        NS.tendrl_context = NS.tendrl_context.load()
        # Ramp the pause up by one second per iteration, then settle on
        # the configured sync interval
        if _sleep > 5:
            _sleep = int(NS.config.data.get("sync_interval", 10))
        else:
            _sleep += 1

        try:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id
            ).load()
            if (_cluster.status == "importing" and (
                    _cluster.current_job['status'] == 'failed')) or \
                    _cluster.status == "unmanaging" or \
                    _cluster.status == "set_volume_profiling":
                time.sleep(_sleep)
                continue

            _cnc = NS.tendrl.objects.ClusterNodeContext(
                node_id=NS.node_context.node_id
            ).load()
            _cnc.is_managed = "yes"
            _cnc.save()
            subprocess.call(
                [
                    'gluster',
                    'get-state',
                    'glusterd',
                    'odir',
                    '/var/run',
                    'file',
                    'glusterd-state',
                    'detail'
                ]
            )
            raw_data = ini2json.ini_to_dict(
                '/var/run/glusterd-state'
            )
            subprocess.call(['rm', '-rf', '/var/run/glusterd-state'])
            subprocess.call(
                [
                    'gluster',
                    'get-state',
                    'glusterd',
                    'odir',
                    '/var/run',
                    'file',
                    'glusterd-state-vol-opts',
                    'volumeoptions'
                ]
            )
            raw_data_options = ini2json.ini_to_dict(
                '/var/run/glusterd-state-vol-opts'
            )
            subprocess.call(
                [
                    'rm',
                    '-rf',
                    '/var/run/glusterd-state-vol-opts'
                ]
            )
            sync_object = NS.gluster.objects.\
                SyncObject(data=json.dumps(raw_data))
            sync_object.save()

            if "Peers" in raw_data:
                index = 1
                peers = raw_data["Peers"]
                disconnected_hosts = []
                # Peers are keyed peer1.*, peer2.*, ...; the KeyError
                # on the first missing index ends the walk
                while True:
                    try:
                        peer = NS.tendrl.\
                            objects.GlusterPeer(
                                peer_uuid=peers['peer%s.uuid' % index],
                                hostname=peers[
                                    'peer%s.primary_hostname' % index
                                ],
                                state=peers['peer%s.state' % index],
                                connected=peers['peer%s.connected' % index]
                            )
                        try:
                            stored_peer_status = None
                            # find peer detail using hostname
                            ip = socket.gethostbyname(
                                peers['peer%s.primary_hostname' % index]
                            )
                            node_id = etcd_utils.read(
                                "/indexes/ip/%s" % ip
                            ).value
                            stored_peer = NS.tendrl.objects.GlusterPeer(
                                peer_uuid=peers['peer%s.uuid' % index],
                                node_id=node_id
                            ).load()
                            stored_peer_status = stored_peer.connected
                            current_status = peers[
                                'peer%s.connected' % index
                            ]
                            if stored_peer_status and \
                                    current_status != stored_peer_status:
                                msg = (
                                    "Peer %s in cluster %s "
                                    "is %s"
                                ) % (
                                    peers[
                                        'peer%s.primary_hostname' %
                                        index
                                    ],
                                    _cluster.short_name,
                                    current_status
                                )
                                instance = "peer_%s" % peers[
                                    'peer%s.primary_hostname' % index
                                ]
                                event_utils.emit_event(
                                    "peer_status",
                                    current_status,
                                    msg,
                                    instance,
                                    'WARNING' if current_status !=
                                    'Connected'
                                    else 'INFO'
                                )
                                # save current status in actual peer
                                # directory also
                                stored_peer.connected = current_status
                                stored_peer.save()
                                # Disconnected host name to
                                # raise brick alert
                                if current_status.lower() == \
                                        "disconnected":
                                    disconnected_hosts.append(
                                        peers[
                                            'peer%s.primary_hostname' %
                                            index
                                        ]
                                    )
                        except etcd.EtcdKeyNotFound:
                            pass
                        SYNC_TTL += 5
                        peer.save(ttl=SYNC_TTL)
                        index += 1
                    except KeyError:
                        break
                # Raise an alert for bricks when peer disconnected
                # or node goes down
                for disconnected_host in disconnected_hosts:
                    brick_status_alert(
                        disconnected_host
                    )
            if "Volumes" in raw_data:
                # create devicetree using lsblk
                devicetree = get_device_tree()
                # find lvs
                lvs = brick_utilization.get_lvs()
                index = 1
                volumes = raw_data['Volumes']
                total_brick_count = 0
                # Same walk as for the peers: a KeyError marks the end
                # of the volume list
                while True:
                    try:
                        b_count = sync_volumes(
                            volumes,
                            index,
                            raw_data_options.get('Volume Options'),
                            SYNC_TTL + VOLUME_TTL,
                            _cluster.short_name,
                            devicetree,
                            lvs
                        )
                        index += 1
                        SYNC_TTL += 1
                        total_brick_count += b_count - 1
                    except KeyError:
                        # from second sync volume ttl is
                        # SYNC_TTL + (no.volumes) * 20 +
                        # (no.of.bricks) * 10 + 160
                        if index > 1:
                            volume_count = index - 1
                            # When all nodes are down we are updating all
                            # volumes are down, node status TTL is 160,
                            # So make sure volumes are present in etcd
                            # while raising volume down alert
                            VOLUME_TTL = (volume_count * 20) + (
                                total_brick_count * 10) + 160
                        break
                # populate the volume specific options
                reg_ex = re.compile("^volume[0-9]+.options+")
                options = {}
                for key in volumes.keys():
                    if reg_ex.match(key):
                        options[key] = volumes[key]
                for key in options.keys():
                    volname = key.split('.')[0]
                    vol_id = volumes['%s.id' % volname]
                    dict1 = {}
                    for k, v in options.items():
                        if k.startswith('%s.options' % volname):
                            dict1['.'.join(k.split(".")[2:])] = v
                            options.pop(k, None)
                    volume = NS.tendrl.objects.GlusterVolume(
                        NS.tendrl_context.integration_id,
                        vol_id=vol_id
                    ).load()
                    if volume.options is not None:
                        dest = dict(volume.options)
                        dest.update(dict1)
                        volume.options = dest
                        volume.save()

            # Sync cluster global details
            if "provisioner/%s" % NS.tendrl_context.integration_id \
                    in NS.node_context.tags:
                all_volumes = NS.tendrl.objects.GlusterVolume(
                    NS.tendrl_context.integration_id
                ).load_all() or []
                volumes = []
                for volume in all_volumes:
                    if not str(volume.deleted).lower() == "true" and \
                            volume.current_job.get('status', '') \
                            in ['', 'finished', 'failed'] and \
                            volume.vol_id not in [None, ''] and \
                            volume.name not in [None, '']:
                        # only for first sync refresh volume TTL
                        # It will increase TTL based on no.of volumes
                        if _cnc.first_sync_done in [None, "no", ""]:
                            etcd_utils.refresh(
                                volume.value,
                                SYNC_TTL + VOLUME_TTL
                            )
                        volumes.append(volume)
                cluster_status.sync_cluster_status(
                    volumes, SYNC_TTL + VOLUME_TTL
                )
                utilization.sync_utilization_details(volumes)
                client_connections.sync_volume_connections(volumes)
                georep_details.aggregate_session_status()
                try:
                    evt.process_events()
                except etcd.EtcdKeyNotFound:
                    pass
                rebalance_status.sync_volume_rebalance_status(volumes)
                rebalance_status.sync_volume_rebalance_estimated_time(
                    volumes
                )
                snapshots.sync_volume_snapshots(
                    raw_data['Volumes'],
                    int(NS.config.data.get(
                        "sync_interval", 10
                    )) + len(volumes) * 4
                )
                # update alert count
                update_cluster_alert_count()
            # check and enable volume profiling
            if "provisioner/%s" % NS.tendrl_context.integration_id in \
                    NS.node_context.tags:
                self._update_volume_profiling()

            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id
            ).load()
            if _cluster.exists():
                _cluster = _cluster.load()
                _cluster.last_sync = str(tendrl_now())
                # Mark the first sync done flag
                _cnc = NS.tendrl.objects.ClusterNodeContext(
                    node_id=NS.node_context.node_id
                ).load()
                if _cnc.first_sync_done in [None, "no"]:
                    _cnc.first_sync_done = "yes"
                    _cnc.save()
                if _cluster.current_job.get(
                    'status', ''
                ) in ['', 'finished', 'failed'] and \
                        _cluster.status in [None, ""]:
                    _cluster.save()

        except Exception as ex:
            Event(
                ExceptionMessage(
                    priority="error",
                    publisher=NS.publisher_id,
                    payload={"message": "gluster sds state sync error",
                             "exception": ex
                             }
                )
            )
        try:
            # An existing _sync_now key skips the pause below
            etcd_utils.read(
                '/clusters/%s/_sync_now' %
                NS.tendrl_context.integration_id
            )
            continue
        except etcd.EtcdKeyNotFound:
            pass

        time.sleep(_sleep)

    logger.log(
        "debug",
        NS.publisher_id,
        {"message": "%s complete" % self.__class__.__name__}
    )
def sync(sync_ttl=None):
    """Sync service tags, the provisioner claim and the tag indexes.

    Steps, all wrapped in one ``try`` whose failures are published as
    an ``ExceptionMessage``:

    1. Save every service in ``TENDRL_SERVICES`` and collect the tags
       of the running ones.
    2. On a non-monitor node with an integration id, try to claim the
       ``provisioner/<integration_id>`` tag by creating its index key.
    3. Merge the collected tags into the node context and save it when
       the tag list changed.
    4. When this node just became provisioner of a managed cluster,
       create a ``ConfigureMonitoring`` job for it.
    5. Make sure this node id is listed under ``/indexes/tags/<tag>``
       for each of its tags, dropping a provisioner tag whose index is
       owned by another node.

    :param sync_ttl: base TTL; index keys are refreshed with
        ``sync_ttl + 50``.  A falsy value skips every refresh.
    """
    try:
        tags = []
        # update node agent service details
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "node_sync, Updating Service data"}
        )
        for service in TENDRL_SERVICES:
            s = NS.tendrl.objects.Service(service=service)
            if s.running:
                service_tag = NS.compiled_definitions.get_parsed_defs(
                )['namespace.tendrl']['tags'][service.strip("@*")]
                tags.append(service_tag)

                if service_tag == "tendrl/server":
                    tags.append("tendrl/monitor")
            s.save()

        if "tendrl/monitor" not in tags and \
                NS.tendrl_context.integration_id:
            _cluster = NS.tendrl.objects.Cluster(
                integration_id=NS.tendrl_context.integration_id
            ).load()
            # Try to claim orphan "provisioner_%integration_id" tag
            _tag = "provisioner/%s" % _cluster.integration_id
            _is_new_provisioner = False
            NS.node_context = NS.tendrl.objects.NodeContext().load()
            if _tag not in NS.node_context.tags:
                try:
                    _index_key = "/indexes/tags/%s" % _tag
                    _node_id = json.dumps([NS.node_context.node_id])
                    # prevExist=False: only the first node to write
                    # the key gets the tag
                    etcd_utils.write(_index_key, _node_id,
                                     prevExist=False)
                    # sync_ttl defaults to None; guard the arithmetic
                    # like the other refresh calls in this function
                    if sync_ttl:
                        etcd_utils.refresh(_index_key, sync_ttl + 50)
                    tags.append(_tag)
                    _is_new_provisioner = True
                except etcd.EtcdAlreadyExist:
                    pass

        # updating node context with latest tags
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "node_sync, updating node context "
                        "data with tags"}
        )
        NS.node_context = NS.tendrl.objects.NodeContext().load()
        current_tags = list(NS.node_context.tags)
        tags += current_tags
        NS.node_context.tags = list(set(tags))
        NS.node_context.tags.sort()
        current_tags.sort()
        if NS.node_context.tags != current_tags:
            NS.node_context.save()

        if "tendrl/monitor" not in tags and \
                NS.tendrl_context.integration_id:
            _cluster = _cluster.load()
            if _is_new_provisioner and _cluster.is_managed == "yes":
                _msg = "node_sync, NEW provisioner node found! "\
                    "re-configuring monitoring (job-id: %s) on this node"
                payload = {
                    "tags": ["tendrl/node_%s" % NS.node_context.node_id],
                    "run": "tendrl.flows.ConfigureMonitoring",
                    "status": "new",
                    "parameters": {
                        'TendrlContext.integration_id':
                            NS.tendrl_context.integration_id
                    },
                    "type": "node"
                }
                _job_id = str(uuid.uuid4())
                NS.tendrl.objects.Job(
                    job_id=_job_id,
                    status="new",
                    payload=payload
                ).save()
                logger.log(
                    "debug",
                    NS.publisher_id,
                    {"message": _msg % _job_id}
                )

        # Update /indexes/tags/:tag = [node_ids]
        # Iterate over a snapshot: a stale provisioner tag is removed
        # from the live list inside the loop, which would otherwise
        # make the iteration skip the following tag
        for tag in list(NS.node_context.tags):
            index_key = "/indexes/tags/%s" % tag
            _node_ids = []
            try:
                _node_ids = etcd_utils.read(index_key).value
                _node_ids = json.loads(_node_ids)
            except etcd.EtcdKeyNotFound:
                pass

            if _node_ids:
                if "provisioner" in tag:
                    # Check if this is a stale provisioner
                    if NS.node_context.node_id != _node_ids[0]:
                        NS.node_context.tags.remove(tag)
                        NS.node_context.save()
                        continue
                if NS.node_context.node_id in _node_ids:
                    if sync_ttl and len(_node_ids) == 1:
                        etcd_utils.refresh(index_key, sync_ttl + 50)
                    continue
                else:
                    _node_ids += [NS.node_context.node_id]
            else:
                _node_ids = [NS.node_context.node_id]
            _node_ids = list(set(_node_ids))

            etcd_utils.write(index_key, json.dumps(_node_ids))
            if sync_ttl and len(_node_ids) == 1:
                etcd_utils.refresh(index_key, sync_ttl + 50)

        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "node_sync, Updating detected "
                        "platform"}
        )
    except Exception as ex:
        Event(
            ExceptionMessage(
                priority="error",
                publisher=NS.publisher_id,
                payload={
                    "message": "node_sync service and indexes "
                               "sync failed: " + ex.message,
                    "exception": ex
                }
            )
        )