Example No. 1
def find_cluster_by_uuid(_uuid):
    """
    Find a cluster name by searching /etc/ceph/*.conf for a conf file
    with the right uuid.
    """
    _uuid = _uuid.lower()
    no_fsid = []
    if not os.path.exists(SYSCONFDIR):
        return None
    for conf_file in os.listdir(SYSCONFDIR):
        if not conf_file.endswith('.conf'):
            continue
        cluster = conf_file[:-5]
        try:
            fsid = get_fsid(cluster)
        except Exception as e:
            if 'getting cluster uuid from configuration failed' not in str(e):
                raise e
            no_fsid.append(cluster)
        else:
            if fsid == _uuid:
                return cluster
    # be tolerant of /etc/ceph/ceph.conf without an fsid defined.
    if len(no_fsid) == 1 and no_fsid[0] == 'ceph':
        logger.warning('No fsid defined in ' + SYSCONFDIR +
                       '/ceph.conf; using anyway')
        return 'ceph'
    return None
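
For context, a minimal sketch of the two names this helper assumes, with SYSCONFDIR pointing at /etc/ceph and get_fsid() reading the fsid option from <cluster>.conf. The configparser-based body below is an illustrative stand-in, not the original helper:

import os
import logging

try:
    import configparser                      # Python 3
except ImportError:
    import ConfigParser as configparser      # Python 2

SYSCONFDIR = '/etc/ceph'                     # assumed conf directory
logger = logging.getLogger(__name__)


def get_fsid(cluster):
    # Illustrative stand-in: read the fsid option from /etc/ceph/<cluster>.conf.
    parser = configparser.ConfigParser()
    parser.read(os.path.join(SYSCONFDIR, cluster + '.conf'))
    try:
        return parser.get('global', 'fsid').lower()
    except Exception:
        # Same message the caller above matches on.
        raise Exception('getting cluster uuid from configuration failed')

With these in place, find_cluster_by_uuid(fsid) returns the matching cluster name, or falls back to 'ceph' when ceph.conf is the only conf file without an fsid.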
Example No. 2
    def do_read_image_metadata(self, io_ctx, meta_object):
        params = {}
        iterator = io_ctx.get_xattrs(meta_object)

        try:
            while True:
                try:
                    pair = iterator.next()
                    (key, value) = pair

                    if str(value) == "":
                        params[key] = ""
                    else:
                        params[key] = value

                except StopIteration as e:
                    break

                except Exception as e:
                    logger.warning("Cannot parse metadata.")
                    break

            return params

        except Exception as e:
            raise MetadataException("Cannot get metadata.")
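
A hedged sketch of how a metadata read like this might be driven with the librados Python bindings; the conf path, pool, and object names are illustrative, and the xattr iterator is consumed with next() just as in the method above:

import rados

# Connect to the cluster and open an I/O context (names below are illustrative)
cluster = rados.Rados(conffile='/etc/ceph/ceph.conf')
cluster.connect()
io_ctx = cluster.open_ioctx('rbd')
try:
    params = {}
    iterator = io_ctx.get_xattrs('image-meta-object')
    while True:
        try:
            key, value = next(iterator)      # raises StopIteration when exhausted
        except StopIteration:
            break
        params[key] = value
    print(params)
finally:
    io_ctx.close()
    cluster.shutdown()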
Example No. 3
def test_active_clean_old():
    cluster_name = configuration().get_cluster_name()
    sleeps = [10, 15, 20, 25, 30, 40]
    tries = 5

    while tries:
        status = False
        try:
            out, err = exec_command(
                "ceph --cluster {} -f json pg stat".format(cluster_name))
            ceph_pg_stat = str(out).replace("'", '')
            ceph_pg_stat = json.loads(ceph_pg_stat)
            logger.info("Ceph status is " +
                        ceph_pg_stat['num_pg_by_state'][0]['name'])

            if str(ceph_pg_stat['num_pg_by_state'][0]
                   ['name']) == 'active+clean':
                status = True
            else:
                status = False
        except Exception as e:
            logger.error("Get ceph status returned error.\n" + e.message)

        if not status:
            tries -= 1
            sleep_seconds = sleeps.pop()
            logger.warning(
                'waiting %s seconds before retrying to check active+clean status',
                sleep_seconds)
            time.sleep(sleep_seconds)
        else:
            # Nautilus calls pool init when active:
            call_cmd('rbd pool init rbd')
            break
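
For reference, a small sketch of the check the test performs on the num_pg_by_state field; the JSON below is an illustrative sample, not output captured from a real cluster:

import json

# Illustrative shape of `ceph -f json pg stat` output (values are made up)
sample = '{"num_pg_by_state": [{"name": "active+clean", "num": 128}], "num_pgs": 128}'

ceph_pg_stat = json.loads(sample)
state_name = str(ceph_pg_stat['num_pg_by_state'][0]['name'])
print(state_name == 'active+clean')          # True for this sample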
Example No. 4
    def __fencing(self, node_name):
        maintenance = ManageMaintenance()
        if maintenance.get_maintenance_config(
        ).fencing == MaintenanceConfigState.off:
            logger.warning(
                "Fencing action will not fire; the admin stopped it, the cluster is in maintenance mode."
            )
            return

        node_list = ConsulAPI().get_node_list()
        for node in node_list:

            if str(node.name) == node_name:
                if Network().ping(node.backend_2_ip):
                    logger.info("This node will stop node {}/{}.".format(
                        node_name, node.backend_2_ip))
                    ssh().call_command(node.backend_2_ip, " poweroff ", 5)
                    break
                elif Network().ping(node.management_ip):
                    logger.info("This node will stop node {}/{}.".format(
                        node_name, node.management_ip))
                    ssh().call_command(node.management_ip, " poweroff ", 5)
                    break
                elif Network().ping(node.backend_1_ip):
                    logger.info("This node will stop node {}/{}.".format(
                        node_name, node.backend_1_ip))
                    ssh().call_command(node.backend_1_ip, " poweroff ", 5)
                    break
Example No. 5
def delete_osd_from_crush_map(osd_id):
    cluster_name = configuration().get_cluster_name()
    logger.info("Start remove osd.{} from crush map".format(osd_id))
    is_executing_without_err = True

    if not call_cmd("ceph --cluster {} osd out osd.{}".format(cluster_name, osd_id)):
        logger.error("Error executing ceph osd out osd.{}".format(osd_id))
        is_executing_without_err = False

    if not call_cmd("ceph --cluster {} osd crush remove osd.{}".format(cluster_name, osd_id)):
        logger.error("Error executing ceph osd crush remove osd.{}".format(osd_id))
        is_executing_without_err = False

    if not call_cmd("ceph --cluster {} auth del osd.{}".format(cluster_name, osd_id)):
        logger.error("Error executing ceph auth del osd.{}".format(osd_id))
        is_executing_without_err = False

    # Try to delete the osd completely from ceph; if the osd is still up, the next command will not execute
    if not call_cmd("ceph --cluster {} osd rm osd.{}".format(cluster_name, osd_id)):
        logger.warning("The osd still up you need to stop osd service of osd.{}".format(osd_id))

    if is_executing_without_err:
        logger.info("osd.{} is removed from crush map".format(osd_id))
    else:
        logger.warning("osd.{} is removed from crush map".format(osd_id))
Example No. 6
    def run(self):
        try:
            result = Result()
            ceph_api = CephAPI()
            cluster_status = ceph_api.get_ceph_cluster_status()
            if cluster_status is not None:
                cluster_status = json.loads(cluster_status)

                available_size = 0
                used_size = 0

                if cluster_status['pgmap']['bytes_total'] > 0:
                    available_size = cluster_status['pgmap']['bytes_avail'] * 100.0 / cluster_status['pgmap']['bytes_total']
                    used_size = cluster_status['pgmap']['bytes_used'] * 100.0 / cluster_status['pgmap']['bytes_total']

                notify_cluster_space_percent = ConfigAPI().get_notify_cluster_used_space_percent()

                if float(used_size) > float(notify_cluster_space_percent):
                    check_state = self.__context.state.get(self.get_plugin_name(), False)

                    if not check_state:
                        result.title = gettext("core_message_notify_title_cluster_out_space")
                        result.message = '\n'.join(gettext("core_message_notify_cluster_out_space").split("\\n")).format(int(available_size))
                        # logger.warning(result.message)
                        result.plugin_name = str(self.get_plugin_name())
                        self.__context.results.append(result)
                        self.__context.state[self.get_plugin_name()] = True
                        logger.warning("Cluster is running out of disk space")
                    return
                self.__context.state[self.get_plugin_name()] = False

        except Exception:
            logger.exception("Error occurred while getting cluster state")
Example No. 7
def get_dev_size(dev, size='megabytes'):
    fd = os.open(dev, os.O_RDONLY)
    dividers = {'bytes': 1, 'megabytes': 1024 * 1024}
    try:
        device_size = os.lseek(fd, 0, os.SEEK_END)
        divider = dividers.get(size, 1024 * 1024)  # default to megabytes
        return device_size // divider
    except Exception as error:
        logger.warning('failed to get size of %s: %s' % (dev, str(error)))
    finally:
        os.close(fd)
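
A short usage sketch, assuming get_dev_size() above is in scope, the process can open the device read-only, and /dev/sda is just an example path:

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

size_mb = get_dev_size('/dev/sda', size='megabytes')
if size_mb is not None:                      # the function returns None if the lseek failed
    print('device size: %d MB' % size_mb)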
Example No. 8
def __test_leaders():
    sleeps = [15, 15, 10, 10, 5, 5]
    tries = 5

    leaders_in_cluster = []
    cluster_members = []

    cluster_conf = configuration()
    current_cluster_info = cluster_conf.get_cluster_info()

    current_node_info = cluster_conf.get_node_info()
    cluster_members.append(current_node_info.name)

    for i in current_cluster_info.management_nodes:
        node_info = NodeInfo()
        node_info.load_json(json.dumps(i))
        cluster_members.append(node_info.name)

    status_report = StatusReport()

    for host in cluster_members:
        while tries:
            status = None
            try:
                status = _leader_status_check_(host)
            except Exception as exc:
                logger.error("Error Connecting to consul for leader check.")
            # if not has_reached_quorum:
            if not status:
                tries -= 1
                sleep_seconds = sleeps.pop()
                logger.warning('waiting %s seconds before retrying',
                               sleep_seconds)
                # time.sleep(sleep_seconds)
                sleep(sleep_seconds)
                status_report.success = False
            else:
                leaders_in_cluster.append(host)
                logger.info('Cluster Node {} joined the cluster and is alive'
                            .format(host))
                status_report.success = True
                break
        if status_report.success is False:
            status_report.failed_tasks.append(
                'core_consul_deploy_build_node_fail_join_cluster_not_alive' +
                "%" + str(host))
    if leaders_in_cluster == cluster_members:
        logger.info("Consul leaders are ready")
        status_report.success = True
        return status_report

    else:
        logger.error("Consul leaders are not ready")
        return status_report
Example No. 9
    def run(self):
        try:
            result = Result()
            ceph_status_overall = ""

            ceph_api = CephAPI()
            cluster_status = ceph_api.get_ceph_cluster_status()                       # ceph status --format json-pretty

            if cluster_status is not None:
                cluster_status = json.loads(cluster_status)

                # Ceph 12 :
                if "overall_status" in cluster_status["health"] and cluster_status["health"]["overall_status"] is not None:
                    ceph_status_overall = cluster_status["health"]["overall_status"]
                else:
                    ceph_status_overall = cluster_status["health"]["status"]

                if ceph_status_overall == "HEALTH_ERR":
                    prv_err  = self.__context.state.get(self.get_plugin_name(), False)

                    if not prv_err:
                        ceph_health_obj = cluster_status["health"]
                        summary_messages = ""
                        summary_messages_ls = []

                        if "checks" in ceph_health_obj:
                            for key in ceph_health_obj["checks"]:
                                if ceph_health_obj["checks"][key] is not None:
                                    msg = ceph_health_obj["checks"][key]["summary"]["message"]
                                    summary_messages_ls.append(msg)

                        summary_messages = '\n    '.join(summary_messages_ls)

                        result.title = gettext("core_message_notify_cluster_status_title")
                        result.message = '\n'.join(gettext("core_message_notify_cluster_status_body").split("\\n")).format(summary_messages)

                        result.plugin_name = str(self.get_plugin_name())
                        self.__context.results.append(result)
                        self.__context.state[self.get_plugin_name()] = True
                        logger.warning("Cluster overall health status is HEALTH_ERR")

                    return

                self.__context.state[self.get_plugin_name()] = False

        except Exception as e:
            logger.exception(e)
            logger.error("An error occurred while ClusterStatusPlugin was running.")
Example No. 10
    def connect(self):

        RETRY_COUNTER = 7
        INTERVAL = 2
        i = 1

        while i <= RETRY_COUNTER:
            cluster = self.do_connect()
            if cluster != -1:
                break

            logger.warning("connect() retry({}) Cannot connect to ceph cluster.".format(str(i)))
            sleep(INTERVAL)  # wait 15 sec
            i += 1
            INTERVAL *= 2

        return cluster
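
The loop above doubles INTERVAL after every failed attempt, so with RETRY_COUNTER = 7 the waits grow exponentially. A quick sketch of the resulting schedule, assuming do_connect() keeps failing:

# Backoff schedule produced by connect(): 7 attempts, doubling the wait each time
interval, waits = 2, []
for attempt in range(1, 8):
    waits.append(interval)
    interval *= 2
print(waits)             # [2, 4, 8, 16, 32, 64, 128] -> about four minutes in total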
Example No. 11
def test_active_clean():
    cluster_name = configuration().get_cluster_name()
    sleeps = [10, 15, 20, 25, 30, 40]
    tries = 5

    while tries:
        ceph_api = CephAPI()
        active_pools = ceph_api.get_active_pools()
        if 'rbd' in active_pools:
            logger.info('rbd pool is active')
            break
        tries -= 1
        sleep_seconds = sleeps.pop()
        logger.warning(
            'waiting %s seconds before retrying to check rbd pool status',
            sleep_seconds)
        time.sleep(sleep_seconds)
Example No. 12
    def log_replication_job(self, job_id, text):
        consul_api = ConsulAPI()

        # Check if the replication job exists in Consul
        job_entity = self.get_replication_job(job_id)

        if job_entity is None:
            # the job entity was deleted
            logger.warning("The job {} does not exist --- {}".format(job_id, str(text)))

        else:
            logs_list = self.get_replication_job_log(job_id)
            logs_list.append(str(text))

            if len(logs_list) > 200:
                del logs_list[0]

            consul_api.log_replication_job(job_id, json.dumps(logs_list))
Example No. 13
def __test_mons():
    sleeps = [15, 15, 10, 10, 5, 5]
    tries = 5
    mon_in_quorum = []
    mon_members = []

    cluster_conf = configuration()
    current_cluster_info = cluster_conf.get_cluster_info()

    for i in current_cluster_info.management_nodes:
        node_info = NodeInfo()
        node_info.load_json(json.dumps(i))
        mon_members.append(node_info.name)

    for host in mon_members:
        while tries:
            status = mon_status_check()
            has_reached_quorum = host in status.get('quorum_names', '')

            if not has_reached_quorum:
                tries -= 1
                sleep_seconds = sleeps.pop()
                logger.warning('Waiting %s seconds before retrying',
                               sleep_seconds)
                time.sleep(sleep_seconds)
            else:
                mon_in_quorum.append(host)
                break

    if mon_in_quorum == mon_members:
        logger.info("Ceph monitors are ready.")
        return True

    else:
        logger.info("Ceph monitors are not ready.")
        return False
Example No. 14
    def start(self):
        self.__image_name_prefix = self.__app_conf.get_image_name_prefix()
        # Handle the case where the cluster has just started
        if self.__node_info.is_management:
            clean_thread = threading.Thread(target=self.handle_cluster_startup)
            clean_thread.start()

        logger.info("Service is starting.")

        keep_resources_flag_path = ConfigAPI().get_keep_resources_flag_path()
        keep_resources_flag = False
        clean = True

        # If the flag file exists, set keep_resources_flag to True and remove the file
        if os.path.exists(keep_resources_flag_path):
            keep_resources_flag = True
            os.remove(keep_resources_flag_path)

        # If no upgrade is needed, get a new session
        if not keep_resources_flag:
            try:
                self.__session = ConsulAPI().get_new_session_ID(
                    self.__session_name, self.__node_info.name)
            except Exception as e:
                logger.error(e)
                self.__session = "0"

            if not self.__session or self.__session is None:
                self.__session = "0"

        # If an upgrade is needed, reuse the current session to keep the Consul resources
        else:
            keep_resources_flag = False
            try:
                sessions = ConsulAPI().get_sessions_dict(
                    'iSCSITarget', self.__node_info.name)
                if sessions is not None and len(sessions) == 1:
                    consul_session = sessions.values()[0]
                    self.__session = consul_session.ID
                    clean = False
                else:
                    self.__session = "0"

            except Exception as ex:
                logger.error("Could not get Consul sessions")
                logger.exception(ex)
                self.__session = "0"

            if clean:
                self.__clean()

        while True:
            try:
                if self.__session == "0":
                    self.__session = ConsulAPI().get_new_session_ID(
                        self.__session_name, self.__node_info.name)

                consul_api = ConsulAPI()
                self.__current_lock_index = consul_api.current_index()
                if not self.__current_lock_index:
                    sleep(1)
                    continue
                self.__process()
                old_index = self.__current_lock_index
                self.__current_lock_index = consul_api.watch(
                    self.__current_lock_index)
                if old_index != self.__current_lock_index:
                    # Give a chance to get all changes that occurred at the same time in Consul.
                    sleep(2)

                self.__exception_retry_timeout = 0
                self.__failure_timeout = timedelta(
                    minutes=self.__app_conf.get_failure_timeout_duration_min(
                    )) + datetime.utcnow()
            except (ConnectionError, RetryConsulException) as ex:
                logger.error("Error on consul connection.")
                logger.exception(ex)
                self.__exception_retry_timeout += 5
            except Exception as ex:
                logger.error("Error during process.")
                logger.exception(ex)
                self.__exception_retry_timeout += 1

            sleep(self.__exception_retry_timeout)
            if self.__exception_retry_timeout > 10:
                logger.warning(
                    "PetaSAN could not complete the process; there are too many exceptions."
                )
                self.__exception_retry_timeout = 1
            sleep(self.__exception_retry_timeout)

            # Clean all installed configurations if the service did not run successfully for 5 minutes.
            if self.__failure_timeout < datetime.utcnow():
                logger.warning(
                    "There are too many exceptions. Service will clean this node."
                )
                self.__clean()
                self.__session = "0"
                self.__failure_timeout = timedelta(
                    minutes=self.__app_conf.get_failure_timeout_duration_min(
                    )) + datetime.utcnow()
Example No. 15
    def __acquire_path(self, path, consul_kv):
        if self.__ignored_acquire_paths.get(path):
            logger.info("Ignore forced path {}".format(path))
            return
        logger.debug("Start acquire path {} by node session {}.".format(
            path, self.__session))
        consul_api = ConsulAPI()
        ceph_api = CephAPI()
        lio_api = LioAPI()
        network_api = NetworkAPI()
        config = configuration()
        try:
            disk_id, path_index = str(path).split("/")
            pool = self._get_pool_by_disk(disk_id)
            if not pool:
                logger.error('Could not find pool for disk ' + disk_id)
                return
            image_name = self.__image_name_prefix + disk_id
            logger.debug(
                "Start read image meta for acquire path {}.".format(path))
            all_image_meta = ceph_api.read_image_metadata(image_name, pool)
            petasan_meta = all_image_meta.get(
                self.__app_conf.get_image_meta_key())
            disk_meta = DiskMeta()
            disk_meta.load_json(petasan_meta)
            logger.debug(
                "End read image meta for acquire path {}.".format(path))

            logger.debug("Try to acquire path {}.".format(path))
            node_name = config.get_node_name()
            result = consul_api.lock_disk_path(
                self.__app_conf.get_consul_disks_path() + path, self.__session,
                node_name, str(consul_kv.CreateIndex))
            if not result:
                logger.info("Could not lock path {} with session {}.".format(
                    path, self.__session))
            elif result:
                if consul_kv.Value is not None and len(str(
                        consul_kv.Value)) > 0 and node_name != str(
                            consul_kv.Value):
                    logger.info("The path {} was locked by {}.".format(
                        path, str(consul_kv.Value)))
                    logger.debug("Node {} will kill node {}.".format(
                        config.get_node_name(), str(consul_kv.Value)))
                    self.__fencing(str(consul_kv.Value))

                # we locked it
                if disk_meta.paths:
                    # if lio has the image name in its backstore already, do not perform rbd mapping
                    if image_name not in self.__backstore:
                        status = ceph_api.map_iamge(image_name, pool)
                    else:
                        status = Status.done
                    if Status.done == status:
                        # Get path info from metadata
                        path_obj = disk_meta.get_paths()[int(path_index) - 1]
                        # add path ips to our network interfaces
                        network_api.add_ip(path_obj.ip, path_obj.subnet_mask,
                                           path_obj.eth, path_obj.vlan_id)
                        # update neighbors arp table
                        network_api.update_neighbors_arp(
                            path_obj.ip, path_obj.eth)
                        # add new target in lio if not there already
                        if not lio_api.is_backstore_image_found(image_name):
                            # Give ceph map image time to complete its job
                            sleep(3)
                            # Add rbd backstores and target
                            status = lio_api.add_target(
                                disk_meta, disk_meta.pool)
                            """
                            wwn = self.calculate_disk_wwn(disk_meta)
                            status = lio_api.add_target(disk_meta, wwn, disk_meta.pool)
                            """
                        if Status.done == status:
                            # enable the path we locked to true
                            self.__last_acquire_succeeded = True
                            lio_api.enable_path(disk_meta.iqn, path_index,
                                                True)
                            logger.info("Path %s acquired successfully" % path)

                            if self.__acquire_warning_counter > 2:
                                logger.info(
                                    "PetaSAN finally succeeded to acquire path after retrying {} times."
                                    .format(self.__acquire_warning_counter))
                                self.__acquire_warning_counter = 0
                            path_assignment_info = self.__force_acquire_paths.get(
                                path)

                            if path_assignment_info:
                                MangePathAssignment().update_path(
                                    path_obj.ip, ReassignPathStatus.succeeded)
                        else:
                            path_assignment_info = self.__force_acquire_paths.get(
                                path)
                            if path_assignment_info:
                                logger.info(
                                    "Failed to acquire forced path {}".format(path))
                                MangePathAssignment().update_path(
                                    path_obj.ip, ReassignPathStatus.failed)
                            self.__last_acquire_succeeded = False
                            self.__acquire_warning_counter += 1
                            if self.__acquire_warning_counter > 2:
                                logger.warning(
                                    "PetaSAN failed to acquire path after {} times."
                                    .format(self.__acquire_warning_counter))
                            logger.error("Error could not acquire path %s" %
                                         path)

                    else:
                        self.__unlock_consul_path(path)

        except Exception as e:
            logger.info("---------------------------------")
            logger.error(str(e.message) + "\n")
            logger.exception(e)

            if str(e.message).find("invalid session") > -1:
                logger.error("Session is invalid")
                try:
                    logger.info("Trying to create new session id")
                    self.__session = ConsulAPI().get_new_session_ID(
                        self.__session_name, self.__node_info.name)
                    logger.info("New session id is {}".format(self.__session))
                    logger.info("Cleaning all mapped disks from old session")
                    self.__clean()
                except Exception as ex:
                    logger.exception(ex)
            logger.exception("Could not acquire path %s" % path)
            raise e
        logger.debug("End acquire path {}.".format(path))
        return