def recoveryUnknownNode(self, nodeId, targetState):
    """Try to bring a node reported as 'unknown' back to ``targetState``.

    Checks the node: if the problem is the database process, the node is
    recovered to the target state; otherwise it is treated as an SSH
    network card problem that has to be repaired manually.

    The agent command ``RECOVERY_UNKNOWN_NODE_TARGETSTATE`` is run on the
    node over SSH and the prefix of its response decides the outcome:

    * ``0#`` - recovery failed; the text after the prefix is the reason.
    * ``1#`` - recovery succeeded.
    * ``2#`` - suspected SSH network card failure, manual support needed.

    Args:
        nodeId: zero-based node index (logged as ``nodeId + 1``).
        targetState: state passed to the agent as the last argument.

    Returns:
        True only when the agent answers ``1#``; False in every other case,
        including SSH failures and unrecognised responses.
    """
    (rst, sshClient) = self.getSSHClient(nodeId)
    if not rst:
        return False

    (rst, msg) = sshClient.execute(
        self.sshClients, nodeId, "RECOVERY_UNKNOWN_NODE_TARGETSTATE",
        [config.dbDatanodePaths, config.dbNodePort, targetState])
    if not rst:
        return False

    if msg.startswith("0#"):
        system_log.error(
            "Recover standby nodeId %d failed for reason:\n%s" %
            ((nodeId + 1), msg[2:]))
        return False
    if msg.startswith("1#"):
        system_log.info("Recover standby nodeId %d succeed:\n" %
                        (nodeId + 1))
        return True
    if msg.startswith("2#"):
        system_log.fatal(
            "There may be have ssh network card failure in the nodeId %s, it needs manual support"
            % (nodeId + 1))
        return False

    # Any other prefix used to fall off the end and return None silently;
    # report it and return an explicit False so callers always get a bool.
    system_log.error(
        "Recover nodeId %d got an unexpected response:\n%s" %
        ((nodeId + 1), msg))
    return False
def nodeFailover(self, nodeId):
    """Fail the primary role over to ``nodeId`` via the node's agent.

    Runs ``SET_FLOATIP_FAILOVER`` on the node and interprets the response
    prefix: ``1#`` means failover and cluster-config refresh both worked,
    ``4#`` means failover worked but the config refresh did not (the
    refresh is then recorded as an unfinished operation and persisted).

    Args:
        nodeId: zero-based index of the candidate primary node.

    Returns:
        True when the node took over as primary (``1#`` or ``4#``),
        False otherwise.
    """
    connected, client = self.getSSHClient(nodeId)
    # With a single network card, failing to get a connection means the host
    # or its card is down; with dual cards it means the listening card failed.
    if not connected:
        system_log.fatal(
            "Cannot ssh connect candidate primary nodeId %d, need manual support!"
            % (nodeId + 1))
        return False

    agentArgs = [
        self.dbNodeListenIps[nodeId] + ",%s" % config.floatIp,
        self.lastDbCluster.nodes[nodeId].nodeName,
        config.dbDatanodePaths
    ]
    executed, reply = client.execute(self.sshClients, nodeId,
                                     "SET_FLOATIP_FAILOVER", agentArgs)
    if not executed:
        return False

    if reply.startswith('1#'):
        system_log.info(
            "the primary node has failed over to nodeId %d and has refreshed the cluster config"
            % (nodeId + 1))
        return True

    if reply.startswith('4#'):
        system_log.info(
            "the primary node has failed over to nodeId %d but refresh the cluster config failed "
            % (nodeId + 1))
        # Remember the pending refresh so it can be retried later.
        context.setUnfinishOper(const.CLUSTER_REFRESH_CONFIG)
        context.saveToFile()
        return True

    system_log.info(
        "Primary failover to nodeId %d failed, need manual support!" %
        (nodeId + 1))
    return False
def closeSSHClient(self, nodeId):
    """Close the cached SSH client of ``nodeId`` and drop it from the cache.

    Does nothing when no client is cached for the node. Errors raised while
    closing are logged, not propagated.

    Args:
        nodeId: zero-based node index into ``self.sshClients``.
    """
    sshClient = self.sshClients[nodeId]
    if sshClient is None:
        return
    try:
        sshClient.close()
        system_log.info("Close ssh connect for node %d success!" %
                        (nodeId + 1))
    except Exception:
        system_log.fatal(traceback.format_exc())
    finally:
        # Forget the client whether or not close() succeeded; otherwise
        # getSSHClient() would keep returning a connection that is closed.
        self.sshClients[nodeId] = None
def buildStanbyForNode(self, nodeId):
    """Ask the agent on ``nodeId`` to rebuild the node as a standby.

    Runs ``BUILD_AS_STANDBY_NODE`` over SSH. If the command cannot be
    executed or the response does not start with ``1#``, a fatal message
    asking for manual recovery is logged. Returns nothing; when no SSH
    connection is available the method returns without logging here.

    Args:
        nodeId: zero-based node index (logged as ``nodeId + 1``).
    """
    connected, client = self.getSSHClient(nodeId)
    if not connected:
        return

    system_log.info("build the nodeId %d to standby" % (nodeId + 1))
    agentArgs = [config.dbDatanodePaths, config.dbNodePort]
    executed, reply = client.execute(self.sshClients, nodeId,
                                     'BUILD_AS_STANDBY_NODE', agentArgs)

    # reply is only inspected when the command actually ran.
    succeeded = executed and reply.startswith("1#")
    if not succeeded:
        system_log.fatal(
            "DB nodeId %d needs manual to recover, the reason is:\n%s" %
            ((nodeId + 1), reply))
def connect(self):
    """Open the SSH connection described by this object's attributes.

    Uses ``self.host``, ``self.port``, ``self.username`` and the RSA private
    key stored in ``self.private_key_file``; the client is kept in
    ``self.ssh``. Connecting uses a 10 second timeout.

    Returns:
        True when the connection was established, False otherwise (the
        traceback is logged and the half-open client is closed).
    """
    client = None
    try:
        client = paramiko.SSHClient()
        self.ssh = client
        # NOTE(review): AutoAddPolicy accepts unknown host keys without
        # verification; confirm this is acceptable for the deployment.
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        key = paramiko.RSAKey.from_private_key_file(self.private_key_file)
        client.connect(
            hostname=self.host,
            port=self.port,
            username=self.username,
            pkey=key,
            timeout=10)
        return True
    except Exception:
        # Exception instead of BaseException: KeyboardInterrupt/SystemExit
        # must not be turned into an ordinary "connect failed".
        system_log.fatal(traceback.format_exc())
        if client is not None:
            # Release whatever the failed attempt left open.
            client.close()
        return False
def buildStandbyForNotLastPrimary(self, primaryNodeIds):
    """Rebuild as standby every current primary that was not the primary
    before the cluster became unavailable.

    Requires exactly one primary to be recorded in
    ``self.dbClusterBeforeUnaviable``; otherwise a fatal message is logged
    and nothing is rebuilt.

    Args:
        primaryNodeIds: zero-based ids of the nodes currently claiming the
            primary role.
    """
    formerPrimaries = self.dbClusterBeforeUnaviable.getPrimaryNodeIds()
    if len(formerPrimaries) != 1:
        system_log.fatal(
            "Can not getting Primary node before Cluster unavailable, DB Cluster needs manual to recover"
        )
        return

    # Demote the primaries that are not the longest-standing one to standby.
    demoteIds = list(set(primaryNodeIds) - set(formerPrimaries))
    shownIds = [nid + 1 for nid in demoteIds]
    system_log.info(
        "the not last primary nodeIds needed to recover to standby are: %s"
        % shownIds)
    for nid in demoteIds:
        self.buildStanbyForNode(nid)
def processStatusUnavailable(self, currDbClusterState):
    """Handle a cluster whose status is unavailable.

    Steps, in order:

    1. More than one primary: demote the extra primaries and stop.
    2. Pending nodes exist: wait for them to recover and stop.
    3. Try to recover the primary recorded before the outage; on success
       recover the faulty standbys and stop.
    4. Otherwise pick a candidate primary and fail over to it.

    Args:
        currDbClusterState: current cluster state object; queried for
            primary node ids and pending nodes and passed on to helpers.
    """
    currentPrimaries = currDbClusterState.getPrimaryNodeIds()
    if len(currentPrimaries) > 1:
        shownIds = [nid + 1 for nid in currentPrimaries]
        system_log.info(
            "There are more than one primary nodeIds, current are %s" %
            shownIds)
        self.buildStandbyForNotLastPrimary(currentPrimaries)
        return

    if currDbClusterState.existsPendingNode():
        system_log.info(
            "Current DB Cluster exists pending nodes, Wait for it to recover automatically!"
        )
        return

    formerPrimaries = self.dbClusterBeforeUnaviable.getPrimaryNodeIds()
    if len(formerPrimaries) != 1:
        system_log.fatal(
            "Can not getting Primary node before Cluster became unavailable, DB cluster needs manual to recover"
        )
        return
    formerPrimaryId = formerPrimaries[0]

    # Recover the node that was primary before the failure.
    recovered = self.recoveryPrimaryNodeBeforeUnaviable(
        formerPrimaryId, currDbClusterState)
    if recovered:
        # Primary is back: bring the faulty standbys back as well.
        self.recoveryFaultStandby(currDbClusterState, formerPrimaries)
        return

    system_log.info(
        "recover the nodeId %s to primary failed, so system will find the candidate primary node and make it to primary."
        % (formerPrimaryId + 1))

    # Find the candidate primary node and switch primary/standby roles.
    found, candidateId = self.getCandidatePrimary()
    if not found:
        return
    system_log.info(
        "the candidate primary nodeId is %d and will fail over to it" %
        (candidateId + 1))
    self.clusterFailover(formerPrimaryId, candidateId, currDbClusterState)
def readConfig(self, fileFullName):
    """Load the status XML file into this object.

    Records the file name, directory and full path, then - if the file
    exists - parses the comma-separated integer lists stored in the
    ``unfinished_operations`` and ``float_ip_state`` elements.

    Args:
        fileFullName: full path of the status file.

    Returns:
        True when the file is absent or was loaded; False when loading
        failed (the traceback is logged).
    """

    def readIntList(root, tagName):
        # First element with this tag; an element without children means
        # an empty list, otherwise its first child holds "1,2,3".
        children = root.getElementsByTagName(tagName)[0].childNodes
        if len(children) == 0:
            return []
        return [int(token) for token in children[0].data.split(",")]

    self.filePath, self.fileName = os.path.split(fileFullName)
    self.fileFullName = fileFullName

    if not os.path.exists(fileFullName):
        system_log.info("no status file: %s is existed." % fileFullName)
        return True

    # Set as soon as the file is known to exist, before it is parsed.
    self.hasStatusFile = True
    try:
        rootNode = parse(fileFullName).documentElement
        self.__unfinishedOperations = readIntList(rootNode,
                                                  "unfinished_operations")
        self.__floatIpStates = readIntList(rootNode, "float_ip_state")
        system_log.info("load status file: %s succeed." % fileFullName)
        system_log.info(str(self))
        return True
    except BaseException:
        system_log.error(traceback.format_exc())
        system_log.fatal("load status file: %s failed, system exits." %
                         fileFullName)
        return False
def getSSHClient(self, nodeId):
    """Return an SSH client for ``nodeId``, connecting on first use.

    A client already stored in ``self.sshClients[nodeId]`` is reused.
    Otherwise a new ``SSH_Client`` is created from the configured node IP,
    database user and private key file, connected, and cached on success.

    Args:
        nodeId: zero-based node index.

    Returns:
        ``(True, client)`` when a client is available, ``(False, None)``
        when the connection could not be established.
    """
    cached = self.sshClients[nodeId]
    # Identity test: only a genuinely empty slot triggers a reconnect,
    # regardless of how the client object implements comparison.
    if cached is not None:
        return (True, cached)

    hostip = config.dbNodeIps[nodeId]
    username = config.dbUser
    privateKeyFile = config.gghcPrivateKeyFile
    sshClient = SSH_Client(hostip, username, privateKeyFile)
    if sshClient.connect():
        self.sshClients[nodeId] = sshClient
        system_log.info("get ssh connection for node %s succeed!" %
                        (nodeId + 1))
        return (True, sshClient)

    self.sshClients[nodeId] = None
    system_log.fatal(
        "gets ssh connection for node %s failed, it needs manual support"
        % (nodeId + 1))
    return (False, None)
def execute(self, sshClients, nodeId, execmd, params=()):
    """Run an agent command on the remote node and return its output.

    The command line is ``config.gghsAgentPath``, ``execmd`` and every
    entry of ``params`` joined by single spaces. It is executed through
    ``self.ssh`` with a pty and the configured SSH timeout.

    Args:
        sshClients: the caller's client cache; the slot of ``nodeId`` is
            reset to None when the command fails.
        nodeId: zero-based node index (logged as ``nodeId + 1``).
        execmd: agent command name.
        params: iterable of extra command-line arguments.

    Returns:
        ``(True, output)`` with everything read from the command's stdout,
        or ``(False, None)`` when sending or reading failed; in that case
        this client is closed.
    """
    # NOTE(review): arguments are interpolated into the command line
    # unquoted; they are assumed to come from trusted configuration.
    parts = [config.gghsAgentPath, execmd] + list(params)
    cmdLine = " ".join("%s" % (part,) for part in parts)
    system_log.debug("Send request to nodeId %d: \n%s" % ((nodeId + 1), cmdLine))
    try:
        _, stdout, _ = self.ssh.exec_command(
            cmdLine, timeout=int(config.sshTimeout), get_pty=True)
        revc_str = "".join(stdout.readlines())
    except Exception:
        # Drop the broken connection so the next getSSHClient() reconnects.
        self.close()
        sshClients[nodeId] = None
        system_log.fatal("The network card may be failure in the connection of nodeId %d at sending '%s'.\n%s" %
                         ((nodeId + 1), execmd, traceback.format_exc()))
        return (False, None)
    system_log.debug("Receive Response from nodeId %d: \n%s" % ((nodeId + 1), revc_str))
    return (True, revc_str)
def recoveryPrimaryNodeBeforeUnaviable(self, oldPrimaryNodeId,
                                       currDbClusterState):
    """Try to recover the node that was primary before the outage.

    Checks whether the former primary made the cluster unavailable because
    of a network fault; if the database process itself failed, it is
    force-started. A node whose state is "unknown" gets
    ``CHECK_AND_RECOVERY_PRIMARY_NODE``; any other state gets
    ``FORCE_RECOVERY_PRIMARY_NODE``.

    Args:
        oldPrimaryNodeId: zero-based id of the former primary.
        currDbClusterState: current cluster state, queried for the node's
            state.

    Returns:
        False when SSH is unavailable, the command could not be run, or
        the agent answers ``0#``; True for every other response.
    """
    connected, client = self.getSSHClient(oldPrimaryNodeId)
    if not connected:
        return False

    nodeState = currDbClusterState.getNodeState(oldPrimaryNodeId).lower()
    if nodeState == "unknown":
        command = "CHECK_AND_RECOVERY_PRIMARY_NODE"
        agentArgs = [config.dbDatanodePaths, config.dbNodePort]
    else:
        command = "FORCE_RECOVERY_PRIMARY_NODE"
        agentArgs = [config.dbDatanodePaths]

    executed, reply = client.execute(self.sshClients, oldPrimaryNodeId,
                                     command, agentArgs)
    if not executed:
        return False

    if reply.startswith("0#"):
        system_log.error(
            "Recover the primary nodeId %d before unavailable failed for the reason:\n%s"
            % ((oldPrimaryNodeId + 1), reply[2:]))
        return False

    if reply.startswith("1#"):
        system_log.info("Recover the nodeId %d to primary succeed." %
                        (oldPrimaryNodeId + 1))
        return True

    # NOTE(review): both branches below log a fatal message yet return True;
    # presumably this keeps the caller from failing over while the old
    # primary may still be running - confirm this is intended.
    if reply.startswith("2#"):
        system_log.fatal(
            "The database process is existed on nodeId %s, please check its ssh newwork card."
            % (oldPrimaryNodeId + 1))
        return True

    system_log.fatal(
        "The nodeId %s state is unknown and can not connect to it, it needs manual support"
        % (oldPrimaryNodeId + 1))
    return True
def confirmPrimaryFloatIp(self, primaryNodeId):
    """Make sure the primary node carries the floating IP.

    Checks via ``CONFIRM_FLOATIP_NETWORK`` whether the floating IP is
    present on the primary; if the agent does not answer ``1#`` the IP is
    configured with ``PRIMARY_ADD_FLOATIP``. The process is terminated
    with ``os._exit(-1)`` when no SSH connection can be obtained or either
    command cannot be executed.

    Args:
        primaryNodeId: zero-based id of the primary node.
    """
    connected, client = self.getSSHClient(primaryNodeId)
    if not connected:
        system_log.fatal(
            "Can not get ssh connection to primary nodeId %d to confirm float ip, system exit"
            % (primaryNodeId + 1))
        os._exit(-1)

    checkArgs = [config.floatipEth, config.floatIp, config.dbNodePort]
    executed, reply = client.execute(self.sshClients, primaryNodeId,
                                     'CONFIRM_FLOATIP_NETWORK', checkArgs)
    if not executed:
        system_log.fatal(
            "ssh connect to primary nodeId %d to confirm float ip failed, system exit"
            % (primaryNodeId + 1))
        os._exit(-1)

    if reply.startswith('1#'):
        system_log.info(
            "successfully confirmed primary node has float ip %s on network card %s"
            % (config.floatIp, config.floatipEth))
        return

    system_log.info(
        "Find primary node has no float ip %s on network card %s or not become effective in postgresql.conf"
        % (config.floatIp, config.floatipEth))
    addArgs = [
        self.dbNodeListenIps[primaryNodeId] + ",%s" % config.floatIp,
        self.lastDbCluster.nodes[primaryNodeId].nodeName,
        config.dbDatanodePaths
    ]
    executed, reply = client.execute(self.sshClients, primaryNodeId,
                                     'PRIMARY_ADD_FLOATIP', addArgs)
    # NOTE(review): only the transport result is checked here; the response
    # prefix of PRIMARY_ADD_FLOATIP is not inspected before logging success.
    if not executed:
        system_log.fatal(
            "set float ip on primary node %d failed, system exited" %
            (primaryNodeId + 1))
        os._exit(-1)
    else:
        system_log.info("set float ip on primary node %d succeed" %
                        (primaryNodeId + 1))
def getCandidatePrimary(self):
    """Choose the node that should become the new primary.

    Looks at the standby nodes recorded before the cluster became
    unavailable: with none, an alarm is logged; with exactly one, that node
    is the candidate; with several, each is asked for its term and LSN
    (``QUERY_NODE_TERM_LSN``) and the most advanced one is chosen.

    Returns:
        ``(True, nodeId)`` with the zero-based candidate id, or
        ``(False, -1)`` when no candidate can be determined.
    """
    standbyNodeIdsBeforeUnaviable = self.dbClusterBeforeUnaviable.getStandbyNodeIds()
    standbyNodesCount = len(standbyNodeIdsBeforeUnaviable)

    if standbyNodesCount == 0:
        system_log.fatal(
            "DB Cluster '%s' has not standby node before become unavailable,"
            " so can not get candidate primary node. it needs manual support!"
            % str(self.dbClusterBeforeUnaviable))
        return (False, -1)

    if standbyNodesCount == 1:
        system_log.info(
            "DB Cluster '%s' has only one standby node before become unavailable, so"
            "it is the candidate primary node" %
            str(self.dbClusterBeforeUnaviable))
        return (True, standbyNodeIdsBeforeUnaviable[0])

    # standbyNodesCount > 1: compare (term, lsn) across the standbys.
    candidateNodeId = -1
    candidateNodeTermlsn = ()
    for nodeId in standbyNodeIdsBeforeUnaviable:
        (rst, sshClient) = self.getSSHClient(nodeId)
        if not rst:
            system_log.fatal(
                "Cannot query term and lsn from nodeId %d because can not get ssh connect,"
                "\nit needs manual support!" % (nodeId + 1))
            return (False, -1)

        (rst, msg) = sshClient.execute(self.sshClients, nodeId,
                                       "QUERY_NODE_TERM_LSN",
                                       [config.dbNodePort])
        if not rst:
            # The command could not be run on this node; skip it.
            system_log.fatal(
                "Querying term and lsn from nodeId %d failed for the reason:\n%s"
                "\nit needs manual support!" % ((nodeId + 1), msg))
            continue
        if not msg.startswith('1#'):
            # msg.startswith('0#') or any other reason; skip this node.
            system_log.fatal(
                "Querying term lsn failed in nodeId %d, the reason:\n%s,\nit needs manual support!"
                % ((nodeId + 1), msg[2:]))
            continue

        (rst, termlsn) = Util.parseTermLsn(msg[2:])
        if not rst:
            system_log.fatal(
                "Parase term lsn from '%s' in nodeId %d, it needs manual support!"
                % (msg[2:], (nodeId + 1)))
            return (False, -1)

        # Keep the most advanced standby: higher term wins, LSN breaks ties.
        # NOTE(review): the previous comparison was inverted and kept the
        # node with the smallest (term, lsn); this assumes larger values
        # mean more up-to-date data - confirm against Util.parseTermLsn.
        if (candidateNodeId == -1
                or termlsn[0] > candidateNodeTermlsn[0]
                or (termlsn[0] == candidateNodeTermlsn[0]
                    and termlsn[1] > candidateNodeTermlsn[1])):
            candidateNodeId = nodeId
            candidateNodeTermlsn = termlsn

    if candidateNodeId == -1:
        # Every query failed; do not report success with an invalid id.
        system_log.fatal(
            "Can not get candidate primary node because querying term and lsn"
            " failed on all standby nodes, it needs manual support!")
        return (False, -1)

    system_log.info("get the candidate nodeId %d" % (candidateNodeId + 1))
    return (True, candidateNodeId)