示例#1
0
    def recoveryUnknownNode(self, nodeId, targetState):
        '''Try to bring a node in the unknown state back to ``targetState``.

        Runs the ``RECOVERY_UNKNOWN_NODE_TARGETSTATE`` command through the
        node's SSH client and interprets the prefix of the reply:

        * ``0#`` - recovery failed; the rest of the reply is logged as the
          reason.
        * ``1#`` - recovery succeeded.
        * ``2#`` - logged as a possible SSH network card failure that needs
          manual repair.

        Args:
            nodeId: zero-based node index (log messages show it one-based).
            targetState: state handed to the remote recovery command.

        Returns:
            True only when the reply starts with ``1#``. False in every other
            case: no SSH client, the command could not be executed, or the
            reply carries a failure or unrecognised prefix.
        '''
        (rst, sshClient) = self.getSSHClient(nodeId)
        if not rst:
            return False

        (rst, msg) = sshClient.execute(
            self.sshClients, nodeId, "RECOVERY_UNKNOWN_NODE_TARGETSTATE",
            [config.dbDatanodePaths, config.dbNodePort, targetState])
        if not rst:
            return False

        if msg.startswith("1#"):
            system_log.info("Recover standby nodeId %d succeed:\n" %
                            (nodeId + 1))
            return True

        if msg.startswith("0#"):
            system_log.error(
                "Recover standby nodeId %d failed for reason:\n%s" %
                ((nodeId + 1), msg[2:]))
        elif msg.startswith("2#"):
            system_log.fatal(
                "There may be have ssh network card failure in the nodeId %s, it needs manual support"
                % (nodeId + 1))
        else:
            # An unrecognised prefix used to fall off the end of the method,
            # returning None without any trace in the log.
            system_log.error(
                "Recover standby nodeId %d got an unexpected response:\n%s" %
                ((nodeId + 1), msg))
        return False
示例#2
0
    def saveToFile(self):
        '''Write the unfinished operations and float-IP states to the status file.

        Builds a ``<Config>`` XML document holding two elements,
        ``unfinished_operations`` and ``float_ip_state``, each containing the
        corresponding list rendered as comma-separated text and preceded by
        an explanatory XML comment. The document is written to a temporary
        ``<fileName>.tmp`` file in ``self.filePath``; then any existing
        ``<fileFullName>.bak`` is removed, the current status file (if any)
        is renamed to ``.bak`` and the temporary file takes its place.

        Returns:
            True when the file was written and rotated; False when any step
            raised (the traceback is logged).
        '''
        impl = minidom.getDOMImplementation()
        doc = impl.createDocument(None, None, None)
        root = doc.createElement('Config')

        # Each value is emitted as an explanatory comment followed by its
        # element, both appended directly under <Config>.
        unfinished_operations_comment = doc.createComment(
            " 1: unfinished float ip opearation, 2: unfinished cluster config refresh"
        )
        root.appendChild(unfinished_operations_comment)

        unfinished_operations = doc.createElement('unfinished_operations')
        # str(list)[1:-1] strips the surrounding brackets, leaving "a, b, c".
        unfinished_operations_text = doc.createTextNode(
            str(self.__unfinishedOperations)[1:-1])
        unfinished_operations.appendChild(unfinished_operations_text)
        root.appendChild(unfinished_operations)

        float_ip_state_comment = doc.createComment(
            "1: normal primary, 0: normal standby, -1: unclear primary floatip"
        )
        root.appendChild(float_ip_state_comment)

        float_ip_state = doc.createElement('float_ip_state')
        float_ip_state_text = doc.createTextNode(
            str(self.__floatIpStates)[1:-1])
        float_ip_state.appendChild(float_ip_state_text)
        root.appendChild(float_ip_state)

        doc.appendChild(root)

        # Write the DOM to a temporary file, then rotate it into place.
        try:
            # os.path.join keeps the path relative when filePath is '' (a
            # bare file name); plain concatenation with os.sep would point at
            # the filesystem root in that case.
            tmpFileFullName = os.path.join(self.filePath,
                                           self.fileName + ".tmp")
            # The file must be closed (and therefore flushed) before it is
            # renamed, otherwise the status file may be left incomplete.
            with open(tmpFileFullName, 'w') as tmpFile:
                doc.writexml(tmpFile,
                             addindent="    ",
                             newl='\n',
                             encoding='utf-8')

            backupFullName = self.fileFullName + ".bak"
            if os.path.exists(backupFullName):
                os.remove(backupFullName)

            if os.path.exists(self.fileFullName):
                os.renames(self.fileFullName, backupFullName)

            os.renames(tmpFileFullName, self.fileFullName)

        except BaseException:
            system_log.error("Save status file failed\n%s" %
                             traceback.format_exc())
            return False

        return True
示例#3
0
    def getClusterListenipConfig(self):
        '''Fetch the cluster listen IPs via node 0 and check them against config.

        The ``GET_CLUSTER_LISTENIP_CONFIG`` command is attempted up to three
        times. The reply is handed to ``setDbNodeListenIp`` and the resulting
        ``self.dbNodeListenIps`` must match ``config.dbNodeIps`` in length
        and element by element.

        The process is terminated with ``os._exit(-1)`` when no SSH client
        for node 0 is available, when all three attempts fail, or when the
        queried addresses differ from the configured ones.
        '''
        (gotClient, sshClient) = self.getSSHClient(0)
        if not gotClient:
            system_log.error(
                "Get SSH Client failed at querying cluster listen ip")
            os._exit(-1)

        queryOutput = ""
        for _attempt in range(3):
            (succeeded, queryOutput) = sshClient.execute(
                self.sshClients, 0, "GET_CLUSTER_LISTENIP_CONFIG",
                [config.dbDatanodePaths])
            if succeeded:
                break

            system_log.error(
                "Get SSH Client failed at querying cluster listen ip")
        else:
            # Reached only when none of the three attempts succeeded.
            os._exit(-1)

        self.setDbNodeListenIp(queryOutput)
        if len(self.dbNodeListenIps) != len(config.dbNodeIps):
            system_log.error(
                "failed to get the db cluster listen addresses to check, maybe network error, system exited!"
            )
            os._exit(-1)

        for nodeId, configuredIp in enumerate(config.dbNodeIps):
            if configuredIp == self.dbNodeListenIps[nodeId]:
                continue
            system_log.error(
                "by checking, the configed listen ips %s is not same as the queried %s, system exited! "
                % (config.dbNodeIps, self.dbNodeListenIps))
            os._exit(-1)
        system_log.info("finished getting the db cluster listen addresses!")
示例#4
0
    def parseListenAddressMsg(cls, msg):
        '''Extract node names and listen IPs from a listen-address query reply.

        Only replies starting with ``1#`` are parsed. Lines are skipped until
        the summary line ("Total GUC values: N ... Failed GUC values: M") is
        found; every later line of the form ``[name] ... 'ip[, ip...]'``
        contributes one node. ``config.floatIp`` is dropped from each node's
        address list, after which exactly one address must remain.

        Args:
            msg: raw reply text, including its two-character status prefix.

        Returns:
            ``(True, nodeNames, nodeListenIps)`` when parsing completes (both
            lists are empty if ``msg`` does not start with ``1#``), or the
            two-element tuple ``(False, [])`` when the summary line or a node
            line is rejected.
        '''
        summaryFlag = False
        nodeListenIps = []
        nodeNames = []
        if msg.startswith('1#'):
            msg = msg[2:]
            for info in msg.split("\n"):
                if not summaryFlag:
                    # Raw string: the pattern text is unchanged, but it no
                    # longer relies on invalid escape sequences.
                    matches = re.findall(
                        r"Total GUC values:\s+(\d+).+Failed GUC values:\s+(\d+)",
                        info)
                    if not matches:
                        continue
                    # NOTE(review): this rejects the summary only when BOTH
                    # the total differs from the node count AND the failed
                    # count is non-zero; "or" may have been intended -
                    # confirm before changing.
                    elif (int(matches[0][0]) != len(config.dbNodeIps)
                          and int(matches[0][1]) != 0):
                        system_log.error(
                            "Get db_listen_address failure: \n%s " % msg)
                        return (False, [])
                    summaryFlag = True
                else:
                    matches = re.match(
                        r".*\[(.*)\]\s*.*'((?:[0-9,\.\s]*))'", info)
                    if matches is None:
                        continue

                    nodeNames.append(matches.groups()[0])
                    rawIps = matches.groups()[1]
                    # Remove any whitespace between the IP addresses.
                    compactIps = "".join(rawIps.split())
                    ips = compactIps.split(",")
                    if config.floatIp in ips:
                        ips.remove(config.floatIp)

                    if len(ips) != 1:
                        system_log.error(
                            "Get db_listen_address failure: \n%s " % msg)
                        return (False, [])
                    nodeListenIps.append(ips[0])

        system_log.info("dbNodenames: %s" % nodeNames)
        system_log.info("nodeListenIps: %s" % nodeListenIps)
        return (True, nodeNames, nodeListenIps)
示例#5
0
    def readConfig(self, fileFullName):
        '''Load the persisted status file, if one exists.

        Always records the file's base name, directory and full path on the
        instance. When the file is absent this is only logged. Otherwise
        ``hasStatusFile`` is set and the ``unfinished_operations`` and
        ``float_ip_state`` elements are read as comma-separated integers.

        Args:
            fileFullName: path of the status file.

        Returns:
            True when the file does not exist or was loaded; False when
            reading or parsing it raised (the traceback is logged).
        '''
        self.fileName = os.path.basename(fileFullName)
        self.filePath = os.path.dirname(fileFullName)
        self.fileFullName = fileFullName

        if not os.path.exists(fileFullName):
            system_log.info("no status file: %s is existed." % fileFullName)
            return True

        def readIntList(rootNode, tagName):
            # Return the integers stored as comma-separated text in the first
            # <tagName> element. An element without children, or whose text
            # is only whitespace, yields an empty list instead of letting
            # int('') fail the whole load.
            childNodes = rootNode.getElementsByTagName(tagName)[0].childNodes
            if len(childNodes) == 0:
                return []
            text = childNodes[0].data
            if not text.strip():
                return []
            return [int(item) for item in text.split(",")]

        self.hasStatusFile = True
        try:
            rootNode = parse(fileFullName).documentElement

            self.__unfinishedOperations = readIntList(
                rootNode, "unfinished_operations")
            self.__floatIpStates = readIntList(rootNode, "float_ip_state")

            system_log.info("load status file: %s succeed." % fileFullName)
            system_log.info(str(self))

            return True
        except BaseException:
            system_log.error(traceback.format_exc())
            system_log.fatal("load status file: %s failed, system exits." %
                             fileFullName)
            return False
示例#6
0
    def recoveryPrimaryNodeBeforeUnaviable(self, oldPrimaryNodeId,
                                           currDbClusterState):
        '''Try to recover the node that was primary before the cluster failed.

        Checks whether the former primary left the cluster Unavailable
        because of a network failure; if the DB process itself failed, the
        node is forced back up. When the node's state in
        ``currDbClusterState`` is "unknown" the
        ``CHECK_AND_RECOVERY_PRIMARY_NODE`` command is run, otherwise
        ``FORCE_RECOVERY_PRIMARY_NODE``.

        Returns:
            False when no SSH client is available, the command could not be
            executed, or the reply starts with ``0#``; True for every other
            reply (``1#`` success, ``2#`` and unrecognised replies are logged
            as fatal but still return True).
        '''
        (gotClient, sshClient) = self.getSSHClient(oldPrimaryNodeId)
        if not gotClient:
            return False

        nodeState = currDbClusterState.getNodeState(oldPrimaryNodeId).lower()
        if nodeState == "unknown":
            command = "CHECK_AND_RECOVERY_PRIMARY_NODE"
            arguments = [config.dbDatanodePaths, config.dbNodePort]
        else:
            command = "FORCE_RECOVERY_PRIMARY_NODE"
            arguments = [config.dbDatanodePaths]

        (executed, reply) = sshClient.execute(self.sshClients,
                                              oldPrimaryNodeId, command,
                                              arguments)
        if not executed:
            return False

        # Node ids are zero-based internally and one-based in the log.
        displayId = oldPrimaryNodeId + 1

        if reply.startswith("0#"):
            system_log.error(
                "Recover the primary nodeId %d before unavailable failed for the reason:\n%s"
                % (displayId, reply[2:]))
            return False

        if reply.startswith("1#"):
            system_log.info("Recover the nodeId %d to primary succeed." %
                            displayId)
            return True

        if reply.startswith("2#"):
            system_log.fatal(
                "The database process is existed on nodeId %s, please check its ssh newwork card."
                % displayId)
            return True

        system_log.fatal(
            "The nodeId %s state is unknown and can not connect to it, it needs manual support"
            % displayId)
        return True
示例#7
0
    def getClusterDbState(self):
        '''Query the cluster state through the next reachable node.

        Starting after ``self.lastCheckNodeid``, the nodes are tried in
        round-robin order until an SSH client is obtained. Pending operations
        are then replayed via ``doUnfinishedOperations`` before
        ``GET_CLUSTER_STATUS`` is executed on that node.

        Returns:
            A ``DbCluster`` built from the command output, or None when no
            SSH client could be obtained, the command could not be executed,
            or ``buildByQuery`` reports failure.
        '''
        nodeSize = len(config.dbNodeIps)
        sshClient = None
        connected = False

        for _ in range(nodeSize):
            self.lastCheckNodeid = (self.lastCheckNodeid + 1) % nodeSize
            (connected, sshClient) = self.getSSHClient(self.lastCheckNodeid)
            if connected:
                break

        # Judge success by the returned flag, as the other callers of
        # getSSHClient do, and not only by the client object being present.
        if not connected or sshClient is None:
            system_log.error("Cannot get SSH Connect, Cluster checking failed")
            return None

        self.doUnfinishedOperations(sshClient, self.lastCheckNodeid)

        system_log.debug("to get db cluster state on nodeId %d" %
                         (self.lastCheckNodeid + 1))
        (rst, cmdOut) = sshClient.execute(self.sshClients,
                                          self.lastCheckNodeid,
                                          "GET_CLUSTER_STATUS")
        if not rst:
            system_log.error(
                "For ssh client reason, failed to get the cluster state and will try it in the next round"
            )
            return None

        tmpDbCluster = DbCluster()
        # The first two characters of the output are dropped before parsing;
        # presumably the "N#" status prefix seen in other replies - confirm.
        rst = tmpDbCluster.buildByQuery(cmdOut[2:])
        if rst:
            system_log.debug("current db cluster state is: %s" %
                             (str(tmpDbCluster)))
            return tmpDbCluster

        system_log.error("current db cluster state is: %s" %
                         (str(tmpDbCluster)))
        return None
示例#8
0
    def check(self):
        '''Run the cluster monitoring loop until the process is stopped.

        Opens the SSH clients and validates the listen-IP configuration, then
        loops forever. Each round queries the cluster state, dispatches on
        it (Normal: nothing, Degraded: ``processStatusDegrade``, anything
        else: ``processStatusUnavailable``), records the state through
        ``prcessClusterState`` and sleeps for the remainder of
        ``config.stateCheckPeriod``. On the first successfully queried round
        the primary node's float IP is confirmed; if that first state is
        neither Normal nor Degraded the process exits with ``os._exit(1)``.

        Errors raised while processing a state are logged and the loop
        continues. The SSH clients are closed when the loop is left.
        '''
        try:
            self.initDbSSHClients()
            self.getClusterListenipConfig()
            system_log.info("System successfully started.")
            print("System successfully started.")
            # True until the first successfully queried state is handled.
            firstCheckFlag = True
            while True:
                start = time.time()

                currDbClusterState = self.getClusterDbState()

                if currDbClusterState is not None:

                    if currDbClusterState.state != "Unavailable":
                        self.dbClusterBeforeUnaviable = None

                    # When the cluster first becomes unavailable, keep the
                    # previous cluster state so that the pre-failure primary
                    # can be recovered if necessary.
                    elif self.dbClusterBeforeUnaviable is None:
                        self.dbClusterBeforeUnaviable = copy.deepcopy(
                            self.lastDbCluster)

                    try:
                        if currDbClusterState.state == "Normal":
                            system_log.debug(
                                "db current state is normal, system does nothing"
                            )
                        elif currDbClusterState.state == "Degraded":
                            system_log.debug(
                                "db current state is degraded, system will process it"
                            )
                            self.processStatusDegrade(
                                currDbClusterState,
                                [currDbClusterState.getPrimaryNodeIds()])
                        else:  # every other state is handled as Unavailable
                            if firstCheckFlag:
                                system_log.error(
                                    "when system start, DB cluster state is '%s'. "
                                    "It should be Normal or Degraded and can successfully acquire db nodes' listen_addresses, "
                                    "so system exits." %
                                    str(currDbClusterState))
                                os._exit(1)

                            system_log.debug(
                                "db current state is unavailable, system will process it"
                            )
                            self.processStatusUnavailable(currDbClusterState)
                    except BaseException:
                        # Log and keep monitoring; one failed round must not
                        # stop the loop.
                        system_log.error("%s", traceback.format_exc())

                    # Handle the state change and record it. Per the original
                    # comment this also copies the changes into lastDbCluster
                    # so that it stays up to date.
                    self.prcessClusterState(currDbClusterState)
                    if firstCheckFlag:
                        # Make sure the primary node has the float IP
                        # configured; configure it if it is missing.
                        primaryNodeIds = currDbClusterState.getPrimaryNodeIds()
                        self.confirmPrimaryFloatIp(primaryNodeIds[0])

                    firstCheckFlag = False

                # Sleep only for what is left of the check period.
                stop = time.time()
                wait = int(config.stateCheckPeriod) - (stop - start)
                if wait > 0:
                    time.sleep(wait)

        finally:
            system_log.info("System stopped.")
            print("System stopped.")
            self.closeDbSSHClients()