示例#1
0
 def update_storages(self, resource_pool_uuid, data):
     """Assign storage roles for every node (optionally scoped to a pool).

     ``data`` maps a storage path to the role it should take.  The update is
     all-or-nothing: every node must already expose each requested path,
     otherwise nothing is written and an error result is returned.  Paths not
     mentioned in ``data`` get their role cleared.
     """
     node_filter = ({'resource_pool_uuid': resource_pool_uuid}
                    if resource_pool_uuid else {})
     nodes = db_api.get_node_with_all(node_filter)
     # Validate first: every node must have all of the requested paths.
     node_uuids = []
     for node in nodes:
         node_uuids.append(node.uuid)
         known_paths = {
             store.path
             for store in db_api.get_node_storage_all(
                 {'node_uuid': node.uuid})
         }
         for path in data:
             if path not in known_paths:
                 return build_result("NodeStorageNotExist",
                                     node=node.name,
                                     path=path)
     # Apply the new roles; clear roles on paths absent from ``data``.
     for store in db_api.get_node_storage_all({}):
         if store.node_uuid not in node_uuids:
             continue
         store.role = data.get(store.path, '')
         store.soft_update()
     return build_result("Success")
示例#2
0
def update_node_performance():
    """Poll each active, non-deleted node's monitor service and persist a
    half-minute performance sample into the database.

    Errors for one node are logged and never block the remaining nodes.
    """
    for node in db_api.get_node_with_all({'deleted': False}):
        try:
            if node.status != constants.STATUS_ACTIVE:
                continue
            ret = monitor_post(
                node.ip, 'api/v1/monitor/resource_perf_for_database',
                {'statis_period': 30})
            if ret.get('code') != 0:
                logger.error(
                    "monitor server error, node_ip:{}, ret: {}".format(
                        node.ip, ret))
                continue
            ret_data = ret.get("data", {})
            sample_time = dt.datetime.fromtimestamp(ret_data.get("utc", 0))
            record = {
                "node_uuid": node.uuid,
                "node_datetime": sample_time,
                "monitor_info": json.dumps(ret_data)
            }
            logger.debug(
                "insert monitor performance data success, node_ip: {}, data: {}"
                .format(node.ip, ret))
            db_api.add_monitor_half_min(record)
        except Exception as e:
            logger.error("get service status error:%s", e, exc_info=True)
示例#3
0
    def rollback_publish(self, package_id, package_path):
        """Delete a previously published upgrade package from all nodes.

        Deletion is fanned out over a thread pool; any node that reports a
        non-zero code is collected and reported back in one error result.
        """
        logger.info("rollback publish upgrade package on compute nodes")
        nodes = db_api.get_node_with_all({})
        failed_nodes = []

        with ThreadPoolExecutor(max_workers=constants.MAX_THREADS) as executor:
            futures = [
                executor.submit(self._sync_delete_package, node.ip,
                                package_id, package_path)
                for node in nodes
            ]
            for future in as_completed(futures):
                res = future.result()
                if res.get("code") == 0:
                    continue
                logger.error(
                    "node: %s rollback publish upgrade package failed: %s",
                    res.get("ipaddr", ""), res.get("msg", ""))
                failed_nodes.append({
                    "ipaddr": res.get("ipaddr", ""),
                    "msg": res.get("msg", "")
                })

        if failed_nodes:
            return get_error_result("UploadPackageSyncError",
                                    {"failed_nodes": failed_nodes})

        return get_error_result("Success")
示例#4
0
 def check_node_status(self):
     """Verify all nodes are reachable and split them into master/slaves.

     Returns an error result as soon as any node is unpingable or marked
     shut down; otherwise returns ``{"master": ip, "slaves": [ip, ...]}``.
     """
     master = None
     slaves = []
     for node in db_api.get_node_with_all({}):
         reachable = icmp_ping(node.ip)
         if not reachable or node.status == constants.STATUS_SHUTDOWN:
             return get_error_result("NodeIPConnetFail")
         is_master = node.type in (constants.ROLE_MASTER_AND_COMPUTE,
                                   constants.ROLE_MASTER)
         if is_master:
             master = node.ip
         else:
             slaves.append(node.ip)
     return {"master": master, "slaves": slaves}
示例#5
0
    def publish(self, package_id, package_path, md5_value=None):
        """Fan out the upgrade-package download to every non-master node.

        Each compute node pulls the package over HTTP from the controller's
        upgrade server; failures are gathered and returned as one error.
        """
        logger.info("sync the upgrade package to compute nodes")
        controller_image = db_api.get_controller_image()
        nodes = db_api.get_node_with_all({})
        failed_nodes = []
        bind = SERVER_CONF.addresses.get_by_default('upgrade_bind', '')
        # Use the configured bind port when present, else fall back to default.
        port = bind.split(':')[-1] if bind else constants.UPGRADE_DEFAULT_PORT
        base_url = "http://%s:%s" % (controller_image.ip, port)
        master_types = (constants.ROLE_MASTER_AND_COMPUTE,
                        constants.ROLE_MASTER)

        with ThreadPoolExecutor(max_workers=constants.MAX_THREADS) as executor:
            futures = []
            for node in nodes:
                # Master-role nodes are skipped from the sync.
                if node.type in master_types:
                    continue
                futures.append(
                    executor.submit(self._sync_download_package, base_url,
                                    node.ip, package_id, package_path,
                                    md5_value))
            for future in as_completed(futures):
                res = future.result()
                if res.get("code") == 0:
                    continue
                logger.error("node :%s sync upgrade package failed: %s",
                             res.get("ipaddr", ""), res.get("msg", ""))
                failed_nodes.append({
                    "ipaddr": res.get("ipaddr", ""),
                    "msg": res.get("msg", "")
                })

        if failed_nodes:
            return get_error_result("UploadPackageSyncError",
                                    {"failed_nodes": failed_nodes})

        return get_error_result("Success")
示例#6
0
def update_node_status():
    """Refresh the health status of every non-deleted node.

    For each node: ping it, query its monitor service for memory/cpu/service
    state, derive a new status (ACTIVE / SHUTDOWN / ERROR) and persist the
    change.  Nodes that are mid-delete are skipped; nodes that recently began
    restarting or are shutting down get grace handling so transient errors
    are not recorded.
    """
    # With HA enabled, the master/backup controller node types are dynamic;
    # refresh the HA info first so each node's type is correct below.
    update_ha_master()

    nodes = db_api.get_node_with_all({'deleted': False})
    for node in nodes:
        is_restart = False
        is_shutdowning = False
        if node.status == constants.STATUS_DELETING:
            continue
        if node.status == constants.STATUS_SHUTDOWNING:
            is_shutdowning = True
        logger.debug("node %s updateing", node.name)
        status = constants.STATUS_ACTIVE
        if node.status == constants.STATUS_RESTARTING:
            restart_time = node.updated_at
            now = datetime.datetime.utcnow()
            # BUGFIX: the original compared only the seconds field of
            # str(now - restart_time) (always < 60), so the 120s restart
            # grace window was effectively always considered active.
            # Compare the real elapsed seconds instead.
            if (now - restart_time).total_seconds() <= 120:
                if not icmp_ping(node.ip, count=2):
                    # Still rebooting and unreachable: keep current status.
                    continue
                is_restart = True
        if not icmp_ping(node.ip, count=3):
            if not is_restart:
                status = constants.STATUS_SHUTDOWN
        else:
            try:
                ret = monitor_post(node.ip, 'api/v1/monitor/memory', {})
                if ret.get('code') == 0:
                    mem_info = ret['data']
                    # Convert bytes to GiB for the stored figures.
                    node.running_mem = mem_info[
                        "available"] / 1024 / 1024 / 1024
                    node.total_mem = mem_info['total'] / 1024 / 1024 / 1024
                    node.mem_utilization = mem_info["utilization"]
                    ret = monitor_post(node.ip, 'api/v1/monitor/cpu', {})
                    cpu_ratio = 0
                    if ret.get('code') == 0:
                        cpu_info = ret['data']
                        cpu_ratio = cpu_info["utilization"]
                        node.cpu_utilization = cpu_info["utilization"]
                    node.soft_update()
                    # A saturated CPU is treated as an error condition.
                    if cpu_ratio >= 95:
                        status = constants.STATUS_ERROR
                    ret = monitor_post(node.ip, 'api/v1/monitor/service', {})
                    if ret.get('code') == 0:
                        services = ret['data']
                        not_running_services = list(
                            filter(
                                lambda service: services[service] != 'running',
                                services.keys()))
                        if node.type in [
                                constants.ROLE_MASTER_AND_COMPUTE,
                                constants.ROLE_MASTER
                        ]:
                            node_services = constants.MASTER_SERVICE
                        elif node.type in [
                                constants.ROLE_SLAVE_AND_COMPUTE,
                                constants.ROLE_COMPUTE
                        ]:
                            node_services = constants.COMPUTE_SERVICE
                        else:
                            node_services = []
                        update_service_status(node, services, node_services)
                        # Any required service not running marks the node ERROR.
                        for service in not_running_services:
                            if service in node_services:
                                logger.error("service %s is not running",
                                             service)
                                status = constants.STATUS_ERROR
                                break
                    else:
                        status = constants.STATUS_ERROR
                else:
                    status = constants.STATUS_ERROR
            except Exception as e:
                logger.error("get service status error:%s", e, exc_info=True)
                status = constants.STATUS_ERROR
        if node.status != status:
            # During restart/shutdown, suppress transient ERROR transitions
            # for nodes whose type is not 1 or 3.
            if status == constants.STATUS_ERROR and is_restart and node.type not in [
                    1, 3
            ]:
                continue
            elif status == constants.STATUS_ERROR and is_shutdowning and node.type not in [
                    1, 3
            ]:
                continue
            logger.info("node %s status change from %s to %s", node.ip,
                        node.status, status)
            node.status = status
            node.soft_update()
        # As long as the node is not shut down, ask its monitor service
        # for disk usage information.
        if status != constants.STATUS_SHUTDOWN:
            update_node_storage(node.ip, node.uuid)
示例#7
0
 def get_top_data(self, statis_period):
     """Collect top-5 resource statistics (cpu/mem/disk/nic) across nodes.

     Queries every ACTIVE node's monitor service concurrently, then sorts
     and truncates each metric list to its five highest entries.  Disk
     figures aggregate only SSD-typed storages; NIC figures only cover the
     node's management network interface.  Returns a Success result with a
     ``data`` payload, or an ``OtherError`` result on unexpected failure.
     """
     logger.debug("get nodes resource_statis top5 data, period {}".format(
         statis_period))
     url = "/api/v1/monitor/resource_statis"
     all_node_cpu_info = []
     all_node_memory_info = []
     all_node_disk_info = []
     all_node_nic_info = []
     # get all nodes ip
     try:
         nodes = db_api.get_node_with_all(
             {'status': constants.STATUS_ACTIVE})
         workers = len(nodes) if len(nodes) > 0 else 1
         all_task = []
         with ThreadPoolExecutor(max_workers=workers) as executor:
             for node in nodes:
                 # BUGFIX: the original mutated one shared request dict
                 # while worker threads still held a reference to it, so a
                 # pending request could be sent with a later node's data.
                 # Build a fresh payload per submission instead.
                 request_data = {
                     "statis_period": statis_period,
                     "node_name": node.name,
                     "node_uuid": node.uuid,
                     "node_ip": node.ip
                 }
                 future = executor.submit(monitor_post, node.ip, url,
                                          request_data)
                 all_task.append(future)
             for future in as_completed(all_task):
                 rep_json = future.result()
                 logger.debug("rep:%s", rep_json)
                 if rep_json["code"] != 0:
                     logger.error(
                         "get node:{} resource_statis info fail".format(
                             rep_json))
                     continue
                 node_name = rep_json.get("data").get("node_name", "")
                 node_uuid = rep_json.get("data").get("node_uuid", "")
                 all_node_cpu_info.append(
                     (node_name, rep_json["data"]["cpu_util"]))
                 all_node_memory_info.append(
                     (node_name, rep_json["data"]["memory_util"]))
                 # Sum total/used over all SSD storage paths, then compute
                 # the node's SSD usage percentage.
                 storages = db_api.get_node_storage_all(
                     {'node_uuid': node_uuid})
                 disk_ssd = [0, 0]  # [total, used]
                 for storage in storages:
                     if storage.type == 1 and storage.path in rep_json[
                             "data"]["disk_util"].keys():  # 1-ssd  2-sata
                         logger.debug(storage.path)
                         disk_ssd[0] += rep_json["data"]["disk_util"][
                             storage.path]["total"]
                         disk_ssd[1] += rep_json["data"]["disk_util"][
                             storage.path]["used"]
                 all_node_disk_info.append(
                     (node_name, '%0.2f' % (disk_ssd[1] / disk_ssd[0] * 100)
                      if disk_ssd[0] else 0, disk_ssd[0], disk_ssd[1]))
                 # Only the management NIC is reported (from
                 # yzy_node_network_info / yzy_interface_ip is_manage).
                 manage_network_name = db_api.get_node_manage_nic_name(
                     node_uuid)
                 logger.debug(manage_network_name)
                 if manage_network_name and manage_network_name in rep_json[
                         "data"]["nic_util"].keys():
                     all_node_nic_info.append(
                         (node_name, rep_json["data"]["nic_util"]
                          [manage_network_name]["sum_bytes_avg"],
                          rep_json["data"]["nic_util"][manage_network_name]
                          ["sum_bytes_max"]))
         resp = get_error_result("Success")
         resp["data"] = {}
         resp["data"]['utc'] = int(
             (dt.datetime.utcnow() -
              dt.datetime.utcfromtimestamp(0)).total_seconds())
         all_node_cpu_info.sort(key=lambda x: float(x[1]), reverse=True)
         resp["data"]["cpu_util"] = all_node_cpu_info[0:5]
         all_node_memory_info.sort(key=lambda x: float(x[1]), reverse=True)
         resp["data"]["memory_util"] = all_node_memory_info[0:5]
         all_node_disk_info.sort(key=lambda x: float(x[1]), reverse=True)
         resp["data"]["disk_util"] = all_node_disk_info[0:5]
         all_node_nic_info.sort(key=lambda x: float(x[1]), reverse=True)
         resp["data"]["nic_util"] = all_node_nic_info[0:5]
         return resp
     except Exception as err:
         logger.error("err {}".format(err))
         logger.error(''.join(traceback.format_exc()))
         return get_error_result("OtherError")