Exemplo n.º 1
0
    def recoveryUnknownNode(self, nodeId, targetState):
        ''' 检测unknown状态的节点,如果是进程问题,则恢复到目标状态,否则是SSH网卡问题,需要人工修复'''
        (rst, sshClient) = self.getSSHClient(nodeId)
        if (not rst):
            return False

        (rst, msg) = sshClient.execute(
            self.sshClients, nodeId, "RECOVERY_UNKNOWN_NODE_TARGETSTATE",
            [config.dbDatanodePaths, config.dbNodePort, targetState])

        if (not rst): return False
        elif (msg.startswith("0#")):
            system_log.error(
                "Recover standby nodeId %d failed for reason:\n%s" %
                ((nodeId + 1), msg[2:]))
            return False
        elif (msg.startswith("1#")):
            system_log.info("Recover standby nodeId %d succeed:\n" %
                            (nodeId + 1))
            return True
        elif (msg.startswith("2#")):
            system_log.fatal(
                "There may be have ssh network card failure in the nodeId %s, it needs manual support"
                % (nodeId + 1))
            return False
Exemplo n.º 2
0
    def getClusterListenipConfig(self):
        (rst, sshClient) = self.getSSHClient(0)
        if (not rst):
            system_log.error(
                "Get SSH Client failed at querying cluster listen ip")
            os._exit(-1)

        exeOut = ""
        for i in range(3):  # 如果失败,尝试3次
            (rst, exeOut) = sshClient.execute(self.sshClients, 0,
                                              "GET_CLUSTER_LISTENIP_CONFIG",
                                              [config.dbDatanodePaths])
            if (rst): break

            system_log.error(
                "Get SSH Client failed at querying cluster listen ip")
            if (i == 2):
                os._exit(-1)

        self.setDbNodeListenIp(exeOut)
        if (len(config.dbNodeIps) != len(self.dbNodeListenIps)):
            system_log.error(
                "failed to get the db cluster listen addresses to check, maybe network error, system exited!"
            )
            os._exit(-1)

        for nodeId in range(len(config.dbNodeIps)):
            if (config.dbNodeIps[nodeId] != self.dbNodeListenIps[nodeId]):
                system_log.error(
                    "by checking, the configed listen ips %s is not same as the queried %s, system exited! "
                    % (config.dbNodeIps, self.dbNodeListenIps))
                os._exit(-1)
        system_log.info("finished getting the db cluster listen addresses!")
Exemplo n.º 3
0
    def clusterFailover(self, lastPrimaryNodeId, candidatePrimaryNodeId,
                        currDbClusterState):
        '''
        1)关闭原先主节点浮动IP,并修改监听IP;
        2)开启候选节点浮动IP,修改监听IP,并failover为主节点;
        3)如果修改原主节点浮动IP成功,将原主节点Build为备节点,
        4)将其它故障节点恢复为备节点
        '''
        # 浮动IP清除是否成功不影响下面操作,忽略其返回值
        system_log.info("to clear primary nodeId %d float ip" %
                        (lastPrimaryNodeId + 1))
        clearFloatIpFlag = self.clearNodeFloatIp(lastPrimaryNodeId)

        rstFailover = self.nodeFailover(candidatePrimaryNodeId)
        if (not rstFailover):
            return False

        recoveryNodeIds = [candidatePrimaryNodeId]
        if (not clearFloatIpFlag):
            recoveryNodeIds.append(lastPrimaryNodeId)

        recoveryNodeIdsOut = [x + 1 for x in recoveryNodeIds]
        system_log.info(
            "recovery all fault standby nodes except the nodeId %s" %
            recoveryNodeIdsOut)
        self.recoveryFaultStandby(currDbClusterState, recoveryNodeIds)
Exemplo n.º 4
0
    def clearNodeFloatIp(self, nodeId):
        clearFlag = True

        (rst, sshClient) = self.getSSHClient(nodeId)
        '''如果是单网卡,获取连接失败,说明主机故障或网卡故障;如果是双网卡说明监听网卡故障'''
        if (sshClient == None):
            clearFlag = False
        else:  # 获取连接成功,清除故障主节点浮动IP
            (rst, msg) = sshClient.execute(
                self.sshClients, nodeId, "CLEAR_NODE_FLOATIP", [
                    self.dbNodeListenIps[nodeId],
                    self.lastDbCluster.nodes[nodeId].nodeName,
                    config.dbDatanodePaths
                ])
            if (not rst or not msg.startswith("1#")):
                clearFlag = False

        if (not clearFlag):
            context.setFloatIpState(nodeId, const.STANDBY_UNCLEAR_FLOATIP)
            context.setUnfinishOper(const.CLEAR_FLOAT_IP)
            context.saveToFile()
            system_log.info(
                "Cannot clear faulty primary node: %d float ip, write it to context and save to file"
                % (nodeId + 1))

        return clearFlag
Exemplo n.º 5
0
    def initDbSSHClients(self):
        nodeSize = len(config.dbNodeIps)
        rstssh = []
        for nodeid in range(nodeSize):
            self.getSSHClient(nodeid)
            rstssh.append(self.sshClients[nodeid] != None)

        system_log.info("finished to init ssh clients, the result is: %s",
                        str(rstssh))
Exemplo n.º 6
0
 def closeSSHClient(self, nodeId):
     sshClient = self.sshClients[nodeId]
     if (sshClient != None):
         try:
             sshClient.close()
             system_log.info("Close ssh connect for node %d success!" %
                             (nodeId + 1))
         except BaseException:
             system_log.fatal(traceback.format_exc())
Exemplo n.º 7
0
 def buildStanbyForNode(self, nodeId):
     (rst, sshClient) = self.getSSHClient(nodeId)
     if (not rst):
         return
     system_log.info("build the nodeId %d to standby" % (nodeId + 1))
     (rst,
      msg) = sshClient.execute(self.sshClients, nodeId,
                               'BUILD_AS_STANDBY_NODE',
                               [config.dbDatanodePaths, config.dbNodePort])
     if (not rst or not msg.startswith("1#")):
         system_log.fatal(
             "DB nodeId %d needs manual to recover, the reason is:\n%s" %
             ((nodeId + 1), msg))
Exemplo n.º 8
0
 def buildStandbyForNotLastPrimary(self, primaryNodeIds):
     primaryNodeIdBeforeUnaviables = self.dbClusterBeforeUnaviable.getPrimaryNodeIds(
     )
     if (len(primaryNodeIdBeforeUnaviables) != 1):
         system_log.fatal(
             "Can not getting Primary node before Cluster unavailable, DB Cluster needs manual to recover"
         )
         return
     '''恢复非持续时间最长的Primary为Standby'''
     needRecoveryNodeids = list(
         set(primaryNodeIds) - set(primaryNodeIdBeforeUnaviables))
     needRecoveryNodeidsOut = [x + 1 for x in needRecoveryNodeids]
     system_log.info(
         "the not last primary nodeIds needed to recover to standby are: %s"
         % needRecoveryNodeidsOut)
     for nodeId in needRecoveryNodeids:
         self.buildStanbyForNode(nodeId)
Exemplo n.º 9
0
    def parseListenAddressMsg(cls, msg):
        summaryFlag = False
        nodeListenIps = []
        nodeNames = []
        if (msg.startswith('1#')):
            msg = msg[2:]
            for info in msg.split("\n"):
                if (not summaryFlag):
                    matches = re.findall(
                        "Total GUC values:\s+(\d+).+Failed GUC values:\s+(\d+)",
                        info)
                    if (len(matches) == 0):
                        continue
                    elif (int(matches[0][0]) != len(config.dbNodeIps)
                          and int(matches[0][1]) != 0):
                        system_log.error(
                            "Get db_listen_address failure: \n%s " % msg)
                        return (False, [])
                    summaryFlag = True
                else:
                    matches = re.match(".*\[(.*)\]\s*.*\'((?:[0-9,\.\s]*))\'",
                                       info)
                    if (matches == None):
                        continue

                    nodeNames.append(matches.groups()[0])
                    str1 = matches.groups()[1]
                    #去除IP地址间可能存在的空格
                    str2 = "".join(str1.split())
                    ips = str2.split(",")
                    if (config.floatIp in ips):
                        ips.remove(config.floatIp)

                    if (len(ips) != 1):
                        system_log.error(
                            "Get db_listen_address failure: \n%s " % msg)
                        return (False, [])
                    nodeListenIps.append(ips[0])

        system_log.info("dbNodenames: %s" % nodeNames)
        system_log.info("nodeListenIps: %s" % nodeListenIps)
        return (True, nodeNames, nodeListenIps)
Exemplo n.º 10
0
    def getSSHClient(self, nodeId):
        if (self.sshClients[nodeId] != None):
            return (True, self.sshClients[nodeId])

        hostip = config.dbNodeIps[nodeId]
        username = config.dbUser
        privateKeyFile = config.gghcPrivateKeyFile
        sshClient = SSH_Client(hostip, username, privateKeyFile)
        rst = sshClient.connect()
        if (rst):
            self.sshClients[nodeId] = sshClient
            system_log.info("get ssh connection for node %s succeed!" %
                            (nodeId + 1))
            return (True, sshClient)
        else:
            self.sshClients[nodeId] = None
            system_log.fatal(
                "gets ssh connection for node %s failed, it needs manual support"
                % (nodeId + 1))
            return (False, None)
Exemplo n.º 11
0
 def nodeFailover(self, nodeId):
     (rst, sshClient) = self.getSSHClient(nodeId)
     '''如果是单网卡,获取连接失败,说明主机故障或网卡故障;如果是双网卡说明监听网卡故障'''
     if (not rst):
         system_log.fatal(
             "Cannot ssh connect candidate primary nodeId %d, need manual support!"
             % (nodeId + 1))
         return False
     else:
         (rst, msg) = sshClient.execute(
             self.sshClients, nodeId, "SET_FLOATIP_FAILOVER", [
                 self.dbNodeListenIps[nodeId] + ",%s" % config.floatIp,
                 self.lastDbCluster.nodes[nodeId].nodeName,
                 config.dbDatanodePaths
             ])
         if (not rst):
             return False
         else:
             if (msg.startswith('1#')):
                 system_log.info(
                     "the primary node has failed over to nodeId %d and has refreshed the cluster config"
                     % (nodeId + 1))
                 return True
             elif (msg.startswith('4#')):
                 system_log.info(
                     "the primary node has failed over to nodeId %d but refresh the cluster config failed "
                     % (nodeId + 1))
                 context.setUnfinishOper(const.CLUSTER_REFRESH_CONFIG)
                 context.saveToFile()
                 return True
             else:
                 system_log.info(
                     "Primary failover to nodeId %d failed, need manual support!"
                     % (nodeId + 1))
                 return False
Exemplo n.º 12
0
    def recoveryPrimaryNodeBeforeUnaviable(self, oldPrimaryNodeId,
                                           currDbClusterState):
        '''检测故障前主节点是否因为网络故障导致集群状态Unavailable,如果是DB进程故障,则强制拉起'''
        (rst, sshClient) = self.getSSHClient(oldPrimaryNodeId)
        if (not rst):
            return False

        if (currDbClusterState.getNodeState(oldPrimaryNodeId).lower() ==
                "unknown"):
            (rst, msg) = sshClient.execute(
                self.sshClients, oldPrimaryNodeId,
                "CHECK_AND_RECOVERY_PRIMARY_NODE",
                [config.dbDatanodePaths, config.dbNodePort])
        else:
            (rst, msg) = sshClient.execute(self.sshClients, oldPrimaryNodeId,
                                           "FORCE_RECOVERY_PRIMARY_NODE",
                                           [config.dbDatanodePaths])

        if (not rst):
            return False
        elif (msg.startswith("0#")):
            system_log.error(
                "Recover the primary nodeId %d before unavailable failed for the reason:\n%s"
                % ((oldPrimaryNodeId + 1), msg[2:]))
            return False
        elif (msg.startswith("1#")):
            system_log.info("Recover the nodeId %d to primary succeed." %
                            (oldPrimaryNodeId + 1))
            return True
        elif (msg.startswith("2#")):
            system_log.fatal(
                "The database process is existed on nodeId %s, please check its ssh newwork card."
                % (oldPrimaryNodeId + 1))
            return True
        else:
            system_log.fatal(
                "The nodeId %s state is unknown and can not connect to it, it needs manual support"
                % (oldPrimaryNodeId + 1))
            return True
Exemplo n.º 13
0
    def processStatusUnavailable(self, currDbClusterState):

        primaryNodeIds = currDbClusterState.getPrimaryNodeIds()
        if (len(primaryNodeIds) > 1):
            primaryNodeIdsOut = [x + 1 for x in primaryNodeIds]
            system_log.info(
                "There are more than one primary nodeIds, current are %s" %
                primaryNodeIdsOut)
            self.buildStandbyForNotLastPrimary(primaryNodeIds)
            return
        elif currDbClusterState.existsPendingNode():
            system_log.info(
                "Current DB Cluster exists pending nodes, Wait for it to recover automatically!"
            )
            return

        primaryNodeIdBeforeUnaviables = self.dbClusterBeforeUnaviable.getPrimaryNodeIds(
        )
        if (len(primaryNodeIdBeforeUnaviables) != 1):
            system_log.fatal(
                "Can not getting Primary node before Cluster became unavailable, DB cluster needs manual to recover"
            )
            return
        '''恢复故障前主节点'''
        rst = self.recoveryPrimaryNodeBeforeUnaviable(
            primaryNodeIdBeforeUnaviables[0], currDbClusterState)
        if (rst):  # 如果恢复主节点成功
            self.recoveryFaultStandby(currDbClusterState,
                                      primaryNodeIdBeforeUnaviables)
            return

        system_log.info(
            "recover the nodeId %s to primary failed, so system will find the candidate primary node and make it to primary."
            % (primaryNodeIdBeforeUnaviables[0] + 1))
        ''' 找到候选主节点进行主备切换 '''
        (rst, candidatePrimaryNodeId) = self.getCandidatePrimary()
        if (not rst):
            return

        system_log.info(
            "the candidate primary nodeId is %d and will fail over to it" %
            (candidatePrimaryNodeId + 1))
        self.clusterFailover(primaryNodeIdBeforeUnaviables[0],
                             candidatePrimaryNodeId, currDbClusterState)
Exemplo n.º 14
0
    def readConfig(self, fileFullName):
        self.fileName = os.path.basename(fileFullName)
        self.filePath = os.path.dirname(fileFullName)
        self.fileFullName = fileFullName

        if (not os.path.exists(fileFullName)):
            system_log.info("no status file: %s is existed." % fileFullName)
            return True

        self.hasStatusFile = True
        try:
            domTree = parse(fileFullName)
            rootNode = domTree.documentElement

            unfinished_operation_nodes = rootNode.getElementsByTagName(
                "unfinished_operations")[0].childNodes
            if (len(unfinished_operation_nodes) == 0):
                self.__unfinishedOperations = []
            else:
                unfinished_operations = unfinished_operation_nodes[
                    0].data.split(",")
                arr = list(map(int, unfinished_operations))
                self.__unfinishedOperations = arr

            float_ip_state_nodes = rootNode.getElementsByTagName(
                "float_ip_state")[0].childNodes
            if (len(float_ip_state_nodes) == 0):
                self.__floatIpStates = []
            else:
                float_ip_state = float_ip_state_nodes[0].data.split(",")
                arr = list(map(int, float_ip_state))
                self.__floatIpStates = arr

            system_log.info("load status file: %s succeed." % fileFullName)
            system_log.info(str(self))

            return True
        except BaseException:
            system_log.error(traceback.format_exc())
            system_log.fatal("load status file: %s failed, system exits." %
                             fileFullName)
            return False
Exemplo n.º 15
0
    def confirmPrimaryFloatIp(self, primaryNodeId):
        '''检测Primary是否存在浮动IP,如果不存在则进行配置'''
        (rst, sshClient) = self.getSSHClient(primaryNodeId)
        if (not rst):
            system_log.fatal(
                "Can not get ssh connection to primary nodeId %d to confirm float ip, system exit"
                % (primaryNodeId + 1))
            os._exit(-1)

        (rst, msg) = sshClient.execute(
            self.sshClients, primaryNodeId, 'CONFIRM_FLOATIP_NETWORK',
            [config.floatipEth, config.floatIp, config.dbNodePort])
        if (not rst):
            system_log.fatal(
                "ssh connect to primary nodeId %d to confirm float ip failed, system exit"
                % (primaryNodeId + 1))
            os._exit(-1)

        if (msg.startswith('1#')):
            system_log.info(
                "successfully confirmed primary node has float ip %s on network card %s"
                % (config.floatIp, config.floatipEth))
        else:
            system_log.info(
                "Find primary node has no float ip %s on network card %s or not become effective in postgresql.conf"
                % (config.floatIp, config.floatipEth))
            (rst, msg) = sshClient.execute(
                self.sshClients, primaryNodeId, 'PRIMARY_ADD_FLOATIP', [
                    self.dbNodeListenIps[primaryNodeId] +
                    ",%s" % config.floatIp,
                    self.lastDbCluster.nodes[primaryNodeId].nodeName,
                    config.dbDatanodePaths
                ])
            if (not rst):
                system_log.fatal(
                    "set float ip on primary node %d failed, system exited" %
                    (primaryNodeId + 1))
                os._exit(-1)
            else:
                system_log.info("set float ip on primary node %d succeed" %
                                (primaryNodeId + 1))
Exemplo n.º 16
0
 def closeDbSSHClients(self):
     nodeSize = len(config.dbNodeIps)
     for nodeId in range(nodeSize):
         self.closeSSHClient(nodeId)
     system_log.info("finished closing all ssh connections!")
Exemplo n.º 17
0
    def doUnfinishedOperations(self, sshClient, nodeId):
        '''
        context.unfinishedOperations操作  1: 主备切换后配置未能刷,  2: 浮动IP处理未完成
        context.float_ip_state: 1: Primary正常启动floatip,0: Standby正常清除floatip,-1: 故障主节点未能清除floatip 
        '''
        if (not context.hasUnfinishOpers()):
            system_log.debug("context has no unfinished operations.")
            return

        system_log.debug("to do unfinished operations")

        stateModFlag = False  # context是否变化
        floatIpAllModifyFlag = True  # 浮动IP是否存在修改失败

        #主备切换后配置未能刷新配置
        if context.needDBRefreshConf():
            system_log.debug("need to refresh db cluster config")
            (rst, rsp) = sshClient.execute(self.sshClients, nodeId,
                                           "CLUSTER_REFRESH_CONFIG")
            if (not rst or not Util.parseRefreshClusterConfMsg(rsp)):
                system_log.info(
                    "Cluster config refreshing failed, and will try again in the next round"
                )
            else:
                system_log.info("Cluster config refreshing success")
                context.removeUnfinishOper(const.CLUSTER_REFRESH_CONFIG)
                stateModFlag = True
        # 浮动IP处理
        elif (context.needClearFloatIp()):
            system_log.debug("need to clear standby node float ip")
            dbNodesSize = len(self.dbNodeListenIps)
            for idx in range(dbNodesSize):
                if (context.getFloatIpState(
                        idx) == const.STANDBY_UNCLEAR_FLOATIP
                    ):  # 说明该节点浮动IP没有消除成功,需获取其sshClient进行
                    (rst, sshClientTmp) = self.getSSHClient(idx)
                    if (not rst):
                        system_log.debug(
                            "Cannot get ssh connection for nodeId %d, will try delete the float ip in the next round"
                            % (idx + 1))
                        floatIpAllModifyFlag = False
                        continue

                    (rst, rsp) = sshClientTmp.execute(
                        self.sshClients, idx, "CLEAR_NODE_FLOATIP_BUILD", [
                            self.dbNodeListenIps[idx], self.dbNodeNames[idx],
                            config.dbDatanodePaths
                        ])
                    if (not rst or not Util.parseRefreshClusterConfMsg(rsp)):
                        system_log.info(
                            "float ip clear failed on nodeId %d, will try it in the next round"
                            % (idx + 1))
                        floatIpAllModifyFlag = False
                    else:
                        system_log.info("float ip clear succeed on nodeId %d" %
                                        (idx + 1))
                        context.setFloatIpState(idx, const.FLOATIP_NORMAL)
                        stateModFlag = True

            if (floatIpAllModifyFlag):
                context.removeUnfinishOper(const.CLEAR_FLOAT_IP)
                system_log.info(
                    "DB cluster's float ip on non primary nodes written in status file are all cleared."
                )

        if (stateModFlag):
            context.saveToFile()
            system_log.debug("context save to file.")
Exemplo n.º 18
0
    def check(self):
        try:
            self.initDbSSHClients()
            self.getClusterListenipConfig()
            system_log.info("System successfully started.")
            print("System successfully started.")
            firstCheckFlag = True  # 首次测试
            while True:
                start = time.time()

                currDbClusterState = self.getClusterDbState()

                # if(system_log.level == logging.DEBUG):
                #    print("db current state is %s" % str(currDbClusterState))

                if (currDbClusterState != None):

                    if (currDbClusterState.state != "Unavailable"):
                        self.dbClusterBeforeUnaviable = None

                    # 在初始进入unavailable状态时,保留之前的集群状态,以便必要时恢复故障前主机
                    elif (self.dbClusterBeforeUnaviable == None):
                        self.dbClusterBeforeUnaviable = copy.deepcopy(
                            self.lastDbCluster)

                    try:
                        if (currDbClusterState.state == "Normal"):
                            self.dbClusterBeforeUnaviable = None
                            system_log.debug(
                                "db current state is normal, system does nothing"
                            )
                            pass
                        elif (currDbClusterState.state == "Degraded"):
                            self.dbClusterBeforeUnaviable = None
                            system_log.debug(
                                "db current state is degraded, system will process it"
                            )
                            self.processStatusDegrade(
                                currDbClusterState,
                                [currDbClusterState.getPrimaryNodeIds()])
                        else:  # Unavailable
                            if (firstCheckFlag):
                                system_log.error(
                                    "when system start, DB cluster state is '%s'. "
                                    "It should be Normal or Degraded and can successfully acquire db nodes' listen_addresses, "
                                    "so system exits." %
                                    str(currDbClusterState))
                                os._exit(1)

                            system_log.debug(
                                "db current state is unavailable, system will process it"
                            )
                            self.processStatusUnavailable(currDbClusterState)
                    except BaseException:
                        system_log.error("%s", traceback.format_exc())
                        traceback.format_exc()

                    #处理状态变化,记录状态日志,将currDbClusterState的变化复制给lastDbCluster,使其保持最新变化
                    self.prcessClusterState(currDbClusterState)
                    if (firstCheckFlag):
                        #确保Primary节点配置有浮动ip,如果没有则进行配置
                        primaryNodeIds = currDbClusterState.getPrimaryNodeIds()
                        self.confirmPrimaryFloatIp(primaryNodeIds[0])

                    firstCheckFlag = False

                stop = time.time()
                wait = int(config.stateCheckPeriod) - (stop - start)
                if (wait > 0):
                    time.sleep(wait)

        finally:
            system_log.info("System stopped.")
            print("System stopped.")
            self.closeDbSSHClients()
        pass
Exemplo n.º 19
0
    def recoveryFaultStandby(self, currDbClusterState, exceptNodeIds=[]):
        for tmpNodeid in range(len(currDbClusterState.nodes)):
            if (tmpNodeid in exceptNodeIds): continue

            node = currDbClusterState.nodes[tmpNodeid]
            if (node.state == "Pending"):
                continue
            elif (node.state == "Standby"):
                if (node.subState == "Normal"):
                    continue
                elif (node.subState == "Need repair"):
                    if (node.supplementInfo.startswith("WAL")):
                        system_log.info(
                            "the node is in state '%s', build it to standby" %
                            str(node))
                        self.buildStanbyForNode(tmpNodeid)
                    else:
                        system_log.debug(
                            "the node is in state '%s', wait for it to recover automatically"
                            % str(node))
                        continue
                elif (node.subState == "Coredump"
                      or node.subState == "Unknown"):
                    system_log.info(
                        "the node is in state '%s', build it to standby" %
                        str(node))
                    self.buildStanbyForNode(tmpNodeid)
                else:
                    system_log.info(
                        "the node is in state '%s', wait for it to recover automatically"
                        % str(node))
            elif (node.state == "Normal" or node.state == "Down"
                  or node.state == "Manually stopped"
                  or node.state == "Abnormal"):
                system_log.info(
                    "the node is in state '%s', build it to standby" %
                    str(node))
                self.buildStanbyForNode(tmpNodeid)
            elif (node.state == "Unknown"):
                system_log.info(
                    "the node is in state '%s', build it to standby" %
                    str(node))
                self.recoveryUnknownNode(tmpNodeid, 'standby')
            elif (node.state == "Primary" and node.subState == "Normal"):
                continue
            else:
                system_log.info(
                    "the node is in state '%s', wait for it to recover automatically"
                    % str(node))
Exemplo n.º 20
0
    def getCandidatePrimary(self):
        ''' 获取unavailable前的Stanby节点,如果只有一个,则它是候选主节点;如果有多个,则按照算法进行选择;如果没有进行告警'''
        standbyNodeIdsBeforeUnaviable = self.dbClusterBeforeUnaviable.getStandbyNodeIds(
        )
        standbyNodesCount = len(standbyNodeIdsBeforeUnaviable)

        if (standbyNodesCount == 0):
            system_log.fatal(
                "DB Cluster '%s' has not standby node before become unavailable,"
                " so can not get candidate primary node. it needs manual support!"
                % str(self.dbClusterBeforeUnaviable))
            return (False, -1)
        elif (standbyNodesCount == 1):
            system_log.info(
                "DB Cluster '%s' has only one standby node before become unavailable, so"
                "it is the candidate primary node" %
                str(self.dbClusterBeforeUnaviable))
            return (True, standbyNodeIdsBeforeUnaviable[0])
        else:  # standbyNodesCount > 1
            candidateNodeId = -1
            candidateNodeTermlsn = ()
            for nodeId in standbyNodeIdsBeforeUnaviable:
                (rst, sshClient) = self.getSSHClient(nodeId)
                if (not rst):
                    system_log.fatal(
                        "Cannot query term and lsn from nodeId %d because can not get ssh connect,"
                        "\nit needs manual support!" % (nodeId + 1))
                    return (False, -1)

                (rst, msg) = sshClient.execute(self.sshClients, nodeId,
                                               "QUERY_NODE_TERM_LSN",
                                               [config.dbNodePort])
                if (not rst):
                    system_log.fatal(
                        "Querying term and lsn from nodeId %d failed for the reason:\n%s"
                        "\nit needs manual support!" % ((nodeId + 1), msg))
                elif (msg.startswith('1#')):
                    (rst, termlsn) = Util.parseTermLsn(msg[2:])
                    if (not rst):
                        system_log.fatal(
                            "Parase term lsn from '%s' in nodeId %d, it needs manual support!"
                            % (msg[2:], (nodeId + 1)))
                        return (False, -1)

                    if (candidateNodeId == -1):
                        candidateNodeId = nodeId
                        candidateNodeTermlsn = termlsn
                    else:
                        if (candidateNodeTermlsn[0] > termlsn[0]
                                or (candidateNodeTermlsn[0] == termlsn[0]
                                    and candidateNodeTermlsn[1] > termlsn[1])):
                            candidateNodeId = nodeId
                            candidateNodeTermlsn = termlsn
                else:  # msg.startswith('0#') 或其它原因
                    system_log.fatal(
                        "Querying term lsn failed in nodeId %d, the reason:\n%s,\nit needs manual support!"
                        % ((nodeId + 1), msg[2:]))

            system_log.info("get the candidate nodeId %d" %
                            (candidateNodeId + 1))
            return (True, candidateNodeId)