def find_cluster_by_uuid(_uuid):
    """
    Find a cluster name by searching /etc/ceph/*.conf for a conf file
    with the right uuid.
    """
    _uuid = _uuid.lower()
    no_fsid = []
    if not os.path.exists(SYSCONFDIR):
        return None
    for conf_file in os.listdir(SYSCONFDIR):
        if not conf_file.endswith('.conf'):
            continue
        cluster = conf_file[:-5]
        try:
            fsid = get_fsid(cluster)
        except Exception as e:
            if 'getting cluster uuid from configuration failed' not in str(e):
                raise e
            no_fsid.append(cluster)
        else:
            if fsid == _uuid:
                return cluster
    # be tolerant of /etc/ceph/ceph.conf without an fsid defined.
    if len(no_fsid) == 1 and no_fsid[0] == 'ceph':
        logger.warning('No fsid defined in ' + SYSCONFDIR + '/ceph.conf; using anyway')
        return 'ceph'
    return None
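# A minimal usage sketch (an assumption, not part of the original module): resolve a
# cluster name from a known fsid before building per-cluster ceph commands. The uuid
# below is made up, and the call relies on this module's SYSCONFDIR, get_fsid and logger.
cluster = find_cluster_by_uuid('8f0e2b51-5a11-4a63-9c1a-2f4b6c8d0e1a')
if cluster is None:
    logger.warning('no conf file under /etc/ceph matches the given fsid')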
def do_read_image_metadata(self, io_ctx, meta_object):
    params = {}
    iterator = io_ctx.get_xattrs(meta_object)
    try:
        while True:
            try:
                pair = iterator.next()
                (key, value) = pair
                if str(value) == "":
                    params[key] = ""
                else:
                    params[key] = value
            except StopIteration as e:
                break
            except Exception as e:
                logger.warning("Cannot parse metadata.")
                break
        return params
    except Exception as e:
        raise MetadataException("Cannot get metadata.")
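# A minimal sketch (an assumption, not from the original class) of how the io_ctx and
# meta_object arguments might be obtained with the python-rados bindings. The pool and
# object names are illustrative, and `reader` stands in for an instance of the class
# that owns do_read_image_metadata().
import rados

cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
cluster.connect()
io_ctx = cluster.open_ioctx('rbd')
try:
    params = reader.do_read_image_metadata(io_ctx, 'rbd_header.some_image_id')
finally:
    io_ctx.close()
    cluster.shutdown()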
def test_active_clean_old():
    cluster_name = configuration().get_cluster_name()
    sleeps = [10, 15, 20, 25, 30, 40]
    tries = 5
    while tries:
        status = False
        try:
            out, err = exec_command("ceph --cluster {} -f json pg stat".format(cluster_name))
            ceph_pg_stat = str(out).replace("'", '')
            ceph_pg_stat = json.loads(ceph_pg_stat)
            logger.info("Ceph status is " + ceph_pg_stat['num_pg_by_state'][0]['name'])
            if str(ceph_pg_stat['num_pg_by_state'][0]['name']) == 'active+clean':
                status = True
            else:
                status = False
        except Exception as e:
            logger.error("Get ceph status returned error.\n" + e.message)
        if not status:
            tries -= 1
            sleep_seconds = sleeps.pop()
            logger.warning('waiting %s seconds before retrying to check active+clean status', sleep_seconds)
            time.sleep(sleep_seconds)
        else:
            # Nautilus: call pool init once the cluster is active
            call_cmd('rbd pool init rbd')
            break
def __fencing(self, node_name):
    maintenance = ManageMaintenance()
    if maintenance.get_maintenance_config().fencing == MaintenanceConfigState.off:
        logger.warning("Fencing action will not fire; the admin has stopped it, the cluster is in maintenance mode.")
        return
    node_list = ConsulAPI().get_node_list()
    for node in node_list:
        if str(node.name) == node_name:
            if Network().ping(node.backend_2_ip):
                logger.info("This node will stop node {}/{}.".format(node_name, node.backend_2_ip))
                ssh().call_command(node.backend_2_ip, " poweroff ", 5)
                break
            elif Network().ping(node.management_ip):
                logger.info("This node will stop node {}/{}.".format(node_name, node.management_ip))
                ssh().call_command(node.management_ip, " poweroff ", 5)
                break
            elif Network().ping(node.backend_1_ip):
                logger.info("This node will stop node {}/{}.".format(node_name, node.backend_1_ip))
                ssh().call_command(node.backend_1_ip, " poweroff ", 5)
                break
def delete_osd_from_crush_map(osd_id):
    cluster_name = configuration().get_cluster_name()
    logger.info("Start remove osd.{} from crush map".format(osd_id))
    is_executing_without_err = True
    if not call_cmd("ceph --cluster {} osd out osd.{}".format(cluster_name, osd_id)):
        logger.error("Error executing ceph osd out osd.{}".format(osd_id))
        is_executing_without_err = False
    if not call_cmd("ceph --cluster {} osd crush remove osd.{}".format(cluster_name, osd_id)):
        logger.error("Error executing ceph osd crush remove osd.{}".format(osd_id))
        is_executing_without_err = False
    if not call_cmd("ceph --cluster {} auth del osd.{}".format(cluster_name, osd_id)):
        logger.error("Error executing ceph auth del osd.{}".format(osd_id))
        is_executing_without_err = False
    # Try to delete the osd completely from ceph; if the osd is still up, the next command will not execute.
    if not call_cmd("ceph --cluster {} osd rm osd.{}".format(cluster_name, osd_id)):
        logger.warning("The osd is still up, you need to stop the osd service of osd.{}".format(osd_id))
    if is_executing_without_err:
        logger.info("osd.{} is removed from crush map".format(osd_id))
    else:
        logger.warning("osd.{} was removed from crush map with errors".format(osd_id))
def run(self):
    try:
        result = Result()
        ceph_api = CephAPI()
        cluster_status = ceph_api.get_ceph_cluster_status()
        if cluster_status is not None:
            cluster_status = json.loads(cluster_status)
            available_size = 0
            used_size = 0
            if cluster_status['pgmap']['bytes_total'] > 0:
                available_size = cluster_status['pgmap']['bytes_avail'] * 100.0 / cluster_status['pgmap']['bytes_total']
                used_size = cluster_status['pgmap']['bytes_used'] * 100.0 / cluster_status['pgmap']['bytes_total']
            notify_cluster_space_percent = ConfigAPI().get_notify_cluster_used_space_percent()
            if float(used_size) > float(notify_cluster_space_percent):
                check_state = self.__context.state.get(self.get_plugin_name(), False)
                if check_state == False:
                    result.title = gettext("core_message_notify_title_cluster_out_space")
                    result.message = '\n'.join(gettext("core_message_notify_cluster_out_space").split("\\n")).format(int(available_size))
                    # logger.warning(result.message)
                    result.plugin_name = str(self.get_plugin_name())
                    self.__context.results.append(result)
                    self.__context.state[self.get_plugin_name()] = True
                    logger.warning("Cluster is running out of disk space")
                return
            self.__context.state[self.get_plugin_name()] = False
    except:
        logger.exception("Error occurred while getting cluster state")
def get_dev_size(dev, size='megabytes'):
    fd = os.open(dev, os.O_RDONLY)
    dividers = {'bytes': 1, 'megabytes': 1024 * 1024}
    try:
        device_size = os.lseek(fd, 0, os.SEEK_END)
        divider = dividers.get(size, 1024 * 1024)  # default to megabytes
        return device_size // divider
    except Exception as error:
        logger.warning('failed to get size of %s: %s' % (dev, str(error)))
    finally:
        os.close(fd)
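# A minimal usage sketch (an assumption, not part of the original module): the device
# path below is illustrative and the caller needs enough privilege to open it.
# get_dev_size() returns None when the size lookup fails, so check before using it.
dev_size_mb = get_dev_size('/dev/sdb')
if dev_size_mb is not None:
    logger.info('device /dev/sdb is %s MB', dev_size_mb)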
def __test_leaders():
    sleeps = [15, 15, 10, 10, 5, 5]
    tries = 5
    leaders_in_cluster = []
    cluster_members = []
    cluster_conf = configuration()
    current_cluster_info = cluster_conf.get_cluster_info()
    current_node_info = cluster_conf.get_node_info()
    cluster_members.append(current_node_info.name)
    for i in current_cluster_info.management_nodes:
        node_info = NodeInfo()
        node_info.load_json(json.dumps(i))
        cluster_members.append(node_info.name)
    status_report = StatusReport()
    for host in cluster_members:
        while tries:
            status = None
            try:
                status = _leader_status_check_(host)
            except Exception as exc:
                logger.error("Error connecting to consul for leader check.")
            # if not has_reached_quorum:
            if not status:
                tries -= 1
                sleep_seconds = sleeps.pop()
                logger.warning('waiting %s seconds before retrying', sleep_seconds)
                # time.sleep(sleep_seconds)
                sleep(sleep_seconds)
                status_report.success = False
            else:
                leaders_in_cluster.append(host)
                logger.info('Cluster Node {} joined the cluster and is alive'.format(host))
                status_report.success = True
                break
        if status_report.success is False:
            status_report.failed_tasks.append('core_consul_deploy_build_node_fail_join_cluster_not_alive' + "%" + str(host))
    if leaders_in_cluster == cluster_members:
        logger.info("Consul leaders are ready")
        status_report.success = True
        return status_report
    else:
        logger.error("Consul leaders are not ready")
        return status_report
def run(self):
    try:
        result = Result()
        ceph_status_overall = ""
        ceph_api = CephAPI()
        cluster_status = ceph_api.get_ceph_cluster_status()  # ceph status --format json-pretty
        if cluster_status is not None:
            cluster_status = json.loads(cluster_status)
            # Ceph 12:
            if "overall_status" in cluster_status["health"] and cluster_status["health"]["overall_status"] is not None:
                ceph_status_overall = cluster_status["health"]["overall_status"]
            else:
                ceph_status_overall = cluster_status["health"]["status"]
            if ceph_status_overall == "HEALTH_ERR":
                prv_err = self.__context.state.get(self.get_plugin_name(), False)
                if not prv_err:
                    ceph_health_obj = cluster_status["health"]
                    summary_messages = ""
                    summary_messages_ls = []
                    if "checks" in ceph_health_obj:
                        for key in ceph_health_obj["checks"]:
                            if ceph_health_obj["checks"][key] is not None:
                                msg = ceph_health_obj["checks"][key]["summary"]["message"]
                                summary_messages_ls.append(msg)
                        summary_messages = '\n '.join(summary_messages_ls)
                    result.title = gettext("core_message_notify_cluster_status_title")
                    result.message = '\n'.join(gettext("core_message_notify_cluster_status_body").split("\\n")).format(summary_messages)
                    result.plugin_name = str(self.get_plugin_name())
                    self.__context.results.append(result)
                    self.__context.state[self.get_plugin_name()] = True
                    logger.warning("Cluster overall health status is HEALTH_ERR")
                return
            self.__context.state[self.get_plugin_name()] = False
    except Exception as e:
        logger.exception(e)
        logger.error("An error occurred while ClusterStatusPlugin was running.")
def connect(self):
    RETRY_COUNTER = 7
    INTERVAL = 2
    i = 1
    while i <= RETRY_COUNTER:
        cluster = self.do_connect()
        if cluster != -1:
            break
        logger.warning("connect() retry({}) Cannot connect to ceph cluster.".format(str(i)))
        sleep(INTERVAL)  # exponential backoff: 2, 4, 8, ... seconds
        i += 1
        INTERVAL *= 2
    return cluster
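# A short caller-side sketch (an assumption, not part of the original class): do_connect()
# is treated as returning -1 on failure, so a fully failed connect() spends about
# 2 + 4 + 8 + 16 + 32 + 64 + 128 = 254 seconds in backoff before giving up. The
# ceph_api variable below stands in for an existing instance of this class.
cluster = ceph_api.connect()
if cluster == -1:
    logger.error("Could not connect to the ceph cluster after all retries.")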
def test_active_clean():
    cluster_name = configuration().get_cluster_name()
    sleeps = [10, 15, 20, 25, 30, 40]
    tries = 5
    while tries:
        ceph_api = CephAPI()
        active_pools = ceph_api.get_active_pools()
        if 'rbd' in active_pools:
            logger.info('rbd pool is active')
            break
        tries -= 1
        sleep_seconds = sleeps.pop()
        logger.warning('waiting %s seconds before retrying to check rbd pool status', sleep_seconds)
        time.sleep(sleep_seconds)
def log_replication_job(self, job_id, text):
    consul_api = ConsulAPI()
    # Check if the replication job exists in Consul
    job_entity = self.get_replication_job(job_id)
    if job_entity is None:
        # The job entity has been deleted.
        # logger.warning("The job {} does not exist --- {}".format(job_id, str(text)))
        pass
    else:
        logs_list = self.get_replication_job_log(job_id)
        logs_list.append(str(text))
        if len(logs_list) > 200:
            del logs_list[0]
        consul_api.log_replication_job(job_id, json.dumps(logs_list))
def __test_mons():
    sleeps = [15, 15, 10, 10, 5, 5]
    tries = 5
    mon_in_quorum = []
    mon_members = []
    cluster_conf = configuration()
    current_cluster_info = cluster_conf.get_cluster_info()
    for i in current_cluster_info.management_nodes:
        node_info = NodeInfo()
        node_info.load_json(json.dumps(i))
        mon_members.append(node_info.name)
    for host in mon_members:
        while tries:
            status = mon_status_check()
            has_reached_quorum = host in status.get('quorum_names', '')
            if not has_reached_quorum:
                tries -= 1
                sleep_seconds = sleeps.pop()
                logger.warning('Waiting %s seconds before retrying', sleep_seconds)
                time.sleep(sleep_seconds)
            else:
                mon_in_quorum.append(host)
                break
    if mon_in_quorum == mon_members:
        logger.info("Ceph monitors are ready.")
        return True
    else:
        logger.info("Ceph monitors are not ready.")
        return False
def start(self):
    self.__image_name_prefix = self.__app_conf.get_image_name_prefix()
    # Handle the case of a cluster that has just started
    if self.__node_info.is_management:
        clean_thread = threading.Thread(target=self.handle_cluster_startup)
        clean_thread.start()
    logger.info("Service is starting.")
    keep_resources_flag_path = ConfigAPI().get_keep_resources_flag_path()
    keep_resources_flag = False
    clean = True
    # If the flag file exists, set keep_resources_flag to True and remove the file.
    if os.path.exists(keep_resources_flag_path):
        keep_resources_flag = True
        os.remove(keep_resources_flag_path)
    # If no upgrade is needed, get a new session.
    if not keep_resources_flag:
        try:
            self.__session = ConsulAPI().get_new_session_ID(self.__session_name, self.__node_info.name)
        except Exception as e:
            logger.error(e)
            self.__session = "0"
        if not self.__session or self.__session is None:
            self.__session = "0"
    # If an upgrade is needed, reuse the current session to keep the consul resources.
    else:
        keep_resources_flag = False
        try:
            sessions = ConsulAPI().get_sessions_dict('iSCSITarget', self.__node_info.name)
            if sessions is not None and len(sessions) == 1:
                consul_session = sessions.values()[0]
                self.__session = consul_session.ID
                clean = False
            else:
                self.__session = "0"
        except Exception as ex:
            logger.error("Could not get Consul sessions")
            logger.exception(ex)
            self.__session = "0"
    if clean:
        self.__clean()
    while True:
        try:
            if self.__session == "0":
                self.__session = ConsulAPI().get_new_session_ID(self.__session_name, self.__node_info.name)
            consul_api = ConsulAPI()
            self.__current_lock_index = consul_api.current_index()
            if not self.__current_lock_index:
                sleep(1)
                continue
            self.__process()
            old_index = self.__current_lock_index
            self.__current_lock_index = consul_api.watch(self.__current_lock_index)
            if old_index != self.__current_lock_index:
                # Give a chance to get all changes that occurred at the same time in consul.
                sleep(2)
            self.__exception_retry_timeout = 0
            self.__failure_timeout = timedelta(minutes=self.__app_conf.get_failure_timeout_duration_min()) + datetime.utcnow()
        except (ConnectionError, RetryConsulException) as ex:
            logger.error("Error on consul connection.")
            logger.exception(ex)
            self.__exception_retry_timeout += 5
        except Exception as ex:
            logger.error("Error during process.")
            logger.exception(ex)
            self.__exception_retry_timeout += 1
        sleep(self.__exception_retry_timeout)
        if self.__exception_retry_timeout > 10:
            logger.warning("PetaSAN could not complete process, there are too many exceptions.")
            self.__exception_retry_timeout = 1
            sleep(self.__exception_retry_timeout)
        # Clean all installed configurations if the service did not succeed for 5 minutes.
        if self.__failure_timeout < datetime.utcnow():
            logger.warning("There are too many exceptions. Service will clean this node.")
            self.__clean()
            self.__session = "0"
            self.__failure_timeout = timedelta(minutes=self.__app_conf.get_failure_timeout_duration_min()) + datetime.utcnow()
def __acquire_path(self, path, consul_kv):
    if self.__ignored_acquire_paths.get(path):
        logger.info("Ignore forced path {}".format(path))
        return
    logger.debug("Start acquire path {} by node session {}.".format(path, self.__session))
    consul_api = ConsulAPI()
    ceph_api = CephAPI()
    lio_api = LioAPI()
    network_api = NetworkAPI()
    config = configuration()
    try:
        disk_id, path_index = str(path).split("/")
        pool = self._get_pool_by_disk(disk_id)
        if not pool:
            logger.error('Could not find pool for disk ' + disk_id)
            return
        image_name = self.__image_name_prefix + disk_id
        logger.debug("Start read image meta for acquire path {}.".format(path))
        all_image_meta = ceph_api.read_image_metadata(image_name, pool)
        petasan_meta = all_image_meta.get(self.__app_conf.get_image_meta_key())
        disk_meta = DiskMeta()
        disk_meta.load_json(petasan_meta)
        logger.debug("End read image meta for acquire path {}.".format(path))
        logger.debug("Try to acquire path {}.".format(path))
        node_name = config.get_node_name()
        result = consul_api.lock_disk_path(self.__app_conf.get_consul_disks_path() + path, self.__session, node_name, str(consul_kv.CreateIndex))
        if not result:
            logger.info("Could not lock path {} with session {}.".format(path, self.__session))
        elif result:
            if consul_kv.Value is not None and len(str(consul_kv.Value)) > 0 and node_name != str(consul_kv.Value):
                logger.info("The path {} was locked by {}.".format(path, str(consul_kv.Value)))
                logger.debug("Node {} will kill node {}.".format(config.get_node_name(), str(consul_kv.Value)))
                self.__fencing(str(consul_kv.Value))
            # we locked it
            if disk_meta.paths:
                # if lio has the image name in its backstore already, do not perform rbd mapping
                if image_name not in self.__backstore:
                    status = ceph_api.map_iamge(image_name, pool)
                else:
                    status = Status.done
                if Status.done == status:
                    # Get path info from metadata
                    path_obj = disk_meta.get_paths()[int(path_index) - 1]
                    # add path ips to our network interfaces
                    network_api.add_ip(path_obj.ip, path_obj.subnet_mask, path_obj.eth, path_obj.vlan_id)
                    # update neighbors arp table
                    network_api.update_neighbors_arp(path_obj.ip, path_obj.eth)
                    # add new target in lio if not there already
                    if not lio_api.is_backstore_image_found(image_name):
                        # Give ceph map image time to complete its job
                        sleep(3)
                        # Add rbd backstores and target
                        status = lio_api.add_target(disk_meta, disk_meta.pool)
                        """
                        wwn = self.calculate_disk_wwn(disk_meta)
                        status = lio_api.add_target(disk_meta, wwn, disk_meta.pool)
                        """
                    if Status.done == status:
                        # enable the path we locked
                        self.__last_acquire_succeeded = True
                        lio_api.enable_path(disk_meta.iqn, path_index, True)
                        logger.info("Path %s acquired successfully" % path)
                        if self.__acquire_warning_counter > 2:
                            logger.info("PetaSAN finally succeeded to acquire path after retrying {} times.".format(self.__acquire_warning_counter))
                            self.__acquire_warning_counter = 0
                        path_assignment_info = self.__force_acquire_paths.get(path)
                        if path_assignment_info:
                            MangePathAssignment().update_path(path_obj.ip, ReassignPathStatus.succeeded)
                    else:
                        path_assignment_info = self.__force_acquire_paths.get(path)
                        if path_assignment_info:
                            logger.info("Acquired forced path {}".format(path))
                            MangePathAssignment().update_path(path_obj.ip, ReassignPathStatus.failed)
                        self.__last_acquire_succeeded = False
                        if self.__acquire_warning_counter > 2:
                            logger.warning("PetaSAN failed to acquire path after {} times.".format(self.__acquire_warning_counter))
                        self.__acquire_warning_counter += 1
                        logger.error("Error could not acquire path %s" % path)
            else:
                self.__unlock_consul_path(path)
    except Exception as e:
        logger.info("---------------------------------")
        logger.error(str(e.message) + "\n")
        logger.exception(e)
        if str(e.message).find("invalid session") > -1:
            logger.error("Session is invalid")
            try:
                logger.info("Trying to create new session id")
                self.__session = ConsulAPI().get_new_session_ID(self.__session_name, self.__node_info.name)
                logger.info("New session id is {}".format(self.__session))
                logger.info("Cleaning all mapped disks from old session")
                self.__clean()
            except Exception as ex:
                logger.exception(ex)
        logger.exception("Could not acquire path %s" % path)
        raise e
    logger.debug("End acquire path {}.".format(path))
    return