def reload_nodes(self, log_all_nodes = False):
    """
    Method:    reload_nodes
    Description: Reload the cluster node list from the database and bring
                 the in-memory list of other nodes in line with it.
    Parameter:
        log_all_nodes: when true, write one info log line per loaded node
    Return: (error code, current node info)
        error code is 0 on success, otherwise
        err_code_mgr.ER_CLUSTER_START_FAILED;
        current node info is a ClusterNodeInfo built from the DB row whose
        ip equals this node's inner ip, or None when there is no such row
        (it is always None on failure).
    Others: On an add failure the method returns immediately, so nodes
            that disappeared from the DB are not removed in that pass.
    """
    cur_node = None

    with self.__lock:
        # Open the node database lazily on first use.
        if self.__mit is None:
            try:
                db_file = os.path.join(self.__app_top_path,
                                       "data", "sqlite", "cluster.db")
                self.__mit = ClusterMit(db_file)
            except Exception:
                # Exception (not a bare except) so that KeyboardInterrupt
                # and SystemExit are not swallowed here.
                tracelog.exception("reload cluster node failed.")
                return err_code_mgr.ER_CLUSTER_START_FAILED, None

        # IPs currently held in memory; whatever is left in this set after
        # walking the DB rows no longer exists in the DB.
        stale_ips = set(node.get_ip() for node in self.__other_nodes)

        my_ip = self.__cluster_cfg_info.my_inner_ip
        for node in self.__mit.get_all_nodes():
            if node.ip == my_ip:
                cur_node = ClusterNodeInfo(node.ip)
                if node.is_enable == 0:
                    cur_node.set_enable(False)
            else:
                stale_ips.discard(node.ip)
                ret = self.__add_node(node.ip, node.is_enable)
                if ret != 0:
                    tracelog.error("add cluster node %s failed." % node.ip)
                    return err_code_mgr.ER_CLUSTER_START_FAILED, None

            if log_all_nodes:
                tracelog.info("load cluster node: %s" % node.ip)

        # Drop the nodes that are no longer present in the DB.
        for node_ip in stale_ips:
            self.__rmv_node(node_ip)

    return 0, cur_node
class ClusterThread(threading.Thread):
    """
    Class: ClusterThread
    Description: Thread that maintains the state of the current cluster
                 node: it loads the node list from the database, exchanges
                 state query / ack messages with the other nodes, and
                 decides whether this node acts as master or slave.
    Base: threading.Thread
    Others: Shared state is guarded by a re-entrant lock (threading.RLock),
            so methods that hold the lock may call other methods that
            acquire it again.
    """

    def __init__(self, cluster_node):
        """
        Method:    __init__
        Description: Create the thread that maintains the state of the
                     current cluster node.
        Parameter:
            cluster_node: the ClusterNode instance that receives the
                          on_start / on_state_change / on_node_online /
                          on_node_offline / on_master_change callbacks
        Return:
        Others:
        """
        threading.Thread.__init__(self)

        # Exit together with the main thread.
        self.daemon = True

        self.__cluster_node = cluster_node

        # callacp instances used for communication between cluster nodes
        self.__callacp_srv = None
        self.__callacp_client = None

        # Cluster configuration
        self.__cluster_cfg_info = None

        # All the other nodes (this node itself is never in the list)
        self.__other_nodes = []

        # mit object used to read the cluster nodes from the database
        self.__mit = None

        # Root directory of the software installation
        self.__app_top_path = ""

        self.__lock = threading.RLock()

        # Counter of heartbeat query rounds that have been sent
        self.__query_counter = 0

        # Current role
        self.__role = CLUSTER_ROLE_UNKNOWN

        # Current state
        self.__state = CLUSTER_STATE_STARTING

        # Set once the thread has been asked to stop
        self.__stoped_event = multiprocessing.Event()

        # Information about the master node
        self.__mater_node_info = MasterNodeInfo()

        # Start time of this node, kept as a string
        self.__start_time = str(time.time())

    def __clear(self):
        """
        Method:    __clear
        Description: Release the callacp endpoints and the mit object and
                     reset the configuration related members.
        Parameter: none
        Return:
        Others:
        """
        if self.__callacp_srv is not None:
            self.__callacp_srv.clear()
            self.__callacp_srv = None

        if self.__callacp_client is not None:
            self.__callacp_client.clear()
            self.__callacp_client = None

        self.__cluster_cfg_info = None
        self.__other_nodes = []

        if self.__mit is not None:
            self.__mit.close()
            self.__mit = None

        self.__app_top_path = ""

    def is_master(self):
        """
        Method:    is_master
        Description: Tell whether the current role is master.
        Parameter: none
        Return: True when the current role is CLUSTER_ROLE_MASTER
        Others:
        """
        with self.__lock:
            return self.__role == CLUSTER_ROLE_MASTER

    def is_only_master(self):
        """
        Method:    is_only_master
        Description: Tell whether this node is a master with no slave
                     node, i.e. the state is CLUSTER_STATE_ONLY_MASTER.
        Parameter: none
        Return: True when the current state is CLUSTER_STATE_ONLY_MASTER
        Others:
        """
        with self.__lock:
            return self.__state == CLUSTER_STATE_ONLY_MASTER

    def is_slave(self):
        """
        Method:    is_slave
        Description: Tell whether the current role is slave.
        Parameter: none
        Return: True when the current role is CLUSTER_ROLE_SLAVE
        Others:
        """
        with self.__lock:
            return self.__role == CLUSTER_ROLE_SLAVE

    def get_role(self):
        """
        Method:    get_role
        Description: Get the role of the current node.
        Parameter: none
        Return: the current role
        Others:
        """
        with self.__lock:
            return self.__role

    def get_master_ip(self):
        """
        Method:    get_master_ip
        Description: Get the ip of the master node.
        Parameter: none
        Return: the ip held by the master node info object
        Others: Reads the master info without taking the lock.
        """
        return self.__mater_node_info.get_ip()

    def get_all_nodes(self):
        """
        Method:    get_all_nodes
        Description: Get the information of all nodes.
        Parameter: none
        Return: a list holding deep copies of the other nodes plus a
                freshly built ClusterNodeInfo for this node (marked online,
                carrying the current role)
        Others:
        """
        with self.__lock:
            all_nodes = copy.deepcopy(self.__other_nodes)

            myself = ClusterNodeInfo(self.__cluster_cfg_info.my_inner_ip)
            myself.set_role(self.__role)
            myself.set_online(True)
            all_nodes.append(myself)

            return all_nodes

    def rmv_node(self, ip):
        """
        Method:    rmv_node
        Description: Remove the given node from the database and from
                     memory. Only offline nodes may be removed.
        Parameter:
            ip: ip of the node to remove
        Return: (error code, error message)
        Others: Removing this node itself, or a node that is online, is
                rejected with ER_CANNOT_RMV_ONLINE_CLUSTER_NODE.
        """
        online_err = (
            err_code_mgr.ER_CANNOT_RMV_ONLINE_CLUSTER_NODE,
            err_code_mgr.get_error_msg(
                err_code_mgr.ER_CANNOT_RMV_ONLINE_CLUSTER_NODE))

        with self.__lock:
            # The current node is running, hence it counts as online.
            if ip == self.__cluster_cfg_info.my_inner_ip:
                return online_err

            for node in self.__other_nodes:
                if node.get_ip() == ip and node.is_online():
                    return online_err

            # Remove from the mit (database) first ...
            ret_code, err_msg = self.__mit.rmv_node(ip)

            if ret_code == 0:
                # ... and from memory only when that succeeded.
                self.__rmv_node(ip)
                tracelog.info("remvoe node %s." % ip)
            else:
                tracelog.error("remvoe node %s failed." % ip)

            return ret_code, err_msg

    def initial_cluster(self, cluster_cfg_info, app_top_path):
        """
        Method:    initial_cluster
        Description: Initialize the cluster: create the callacp server and
                     client, listen on the inner ip, load the nodes from
                     the database and register this node when it is not
                     stored there yet.
        Parameter:
            cluster_cfg_info: the cluster configuration
            app_top_path: root directory of the software installation
        Return: error code, 0 on success
        Others:
        """
        self.__clear()

        self.__cluster_cfg_info = cluster_cfg_info
        self.__app_top_path = app_top_path

        # Create the callacp server and client.
        self.__callacp_srv = pycallacp.CallAcpServer()
        self.__callacp_client = pycallacp.CallAcpClient()

        self.__callacp_srv.set_event_handler(ClusterServerEventHandler(self))
        self.__callacp_client.set_event_handler(ClusterServerEventHandler(self))

        self.__callacp_srv.set_msg_buf_max_num(3)
        self.__callacp_client.set_msg_buf_max_num(3)

        my_ip = cluster_cfg_info.my_inner_ip
        ret_code = self.__callacp_srv.bind(my_ip, CLUSTER_LISTEN_PORT)
        if ret_code != 0:
            tracelog.error("cluster: listen on (%s, %d) failed." % (
                              my_ip
                            , CLUSTER_LISTEN_PORT))
            return ret_code

        tracelog.info("cluster: listen on (%s, %d) ok." % (
                          my_ip
                        , CLUSTER_LISTEN_PORT))

        # Load the nodes from the database, logging every one of them.
        ret_code, cur_node = self.reload_nodes(True)
        if ret_code != 0:
            tracelog.error("load cluster nodes from DB failed. ret:%d" % ret_code)
            return ret_code

        if cur_node is None:
            # This node is not in the DB yet: insert it, unless the
            # cluster has already reached its maximum number of nodes.
            if len(self.__other_nodes) >= cluster_cfg_info.max_nodes_num:
                tracelog.error("The number of cluster nodes has reached the "
                               "maximum(%d)" % cluster_cfg_info.max_nodes_num)
                return err_code_mgr.ER_CLUSTER_REACH_MAX

            ret_code = self.__mit.save_node(my_ip, True)
            if ret_code != 0:
                tracelog.error("save current nodes to DB failed. ret:%d" % ret_code)
                return ret_code
        else:
            # A disabled node is not allowed to start.
            if not cur_node.is_enable():
                tracelog.error("the current node is disabled, can not start.")
                return err_code_mgr.ER_CLUSTER_IS_DISABLED

        # Unbind the virtual ip (failures are not logged here); it is
        # bound again when this node switches to master.
        self.__unbind_virtual_ip(False)

        return ret_code

    def stop_cluster(self):
        """
        Method:    stop_cluster
        Description: Stop the current node.
        Parameter: none
        Return: none
        Others: Sets the stop event polled by run() and, when this node is
                the master, unbinds the virtual ip.
        """
        self.__stoped_event.set()

        if self.is_master():
            self.__unbind_virtual_ip(True)

    def is_node_prior(self, node):
        """
        Method:    is_node_prior
        Description: Tell whether the given node has a higher arbitration
                     priority than the current node.
        Parameter:
            node: the node to compare with
        Return: True when the ip of the node sorts before this node's
                inner ip
        Others: The ips are compared as plain strings.
        """
        return node.get_ip() < self.__cluster_cfg_info.my_inner_ip

    def __get_url(self, node_ip):
        """
        Method:    __get_url
        Description: Build the url of a node from its ip.
        Parameter:
            node_ip: ip of the node
        Return: the url of the node, "tcp://<ip>:<CLUSTER_LISTEN_PORT>"
        Others:
        """
        return "tcp://%s:%d" %(node_ip, CLUSTER_LISTEN_PORT)

    def __add_node(self, node_ip, is_enable):
        """
        Method:    __add_node
        Description: Add a node to the in-memory list and open a
                     connection to it.
        Parameter:
            node_ip: ip of the node
            is_enable: 0 when the node is disabled
        Return: error code; 0 when the node is this node itself, is
                already known, or the connection was created
        Others: A node that is already in the list is left untouched.
        """
        ret = 0

        if node_ip == self.__cluster_cfg_info.my_inner_ip:
            return ret

        for node in self.__other_nodes:
            if node.get_ip() == node_ip:
                break
        else:
            # Not known yet: remember it and connect to it.
            node_info = ClusterNodeInfo(node_ip)
            if is_enable == 0:
                node_info.set_enable(False)

            self.__other_nodes.append(node_info)

            url = self.__get_url(node_ip)
            ret = self.__callacp_client.new_connect(url)
            if ret != 0:
                tracelog.error("new connection to cluster node failed. %s" % url)

        return ret

    def __rmv_node(self, node_ip):
        """
        Method:    __rmv_node
        Description: Remove a node from the in-memory list and drop the
                     connection to it.
        Parameter:
            node_ip: ip of the node
        Return:
        Others: Does nothing when the ip is not in the list.
        """
        for i, node in enumerate(self.__other_nodes):
            if node.get_ip() != node_ip:
                continue

            self.__other_nodes.pop(i)
            self.__callacp_client.rmv_connect(self.__get_url(node_ip))

            # __add_node never stores the same ip twice, and stopping here
            # avoids iterating over a list that has just been modified.
            break

    def reload_nodes(self, log_all_nodes = False):
        """
        Method:    reload_nodes
        Description: Reload the cluster node list from the database and
                     bring the in-memory list of other nodes in line
                     with it.
        Parameter:
            log_all_nodes: when true, write one info log line per loaded
                           node
        Return: (error code, current node info)
            error code is 0 on success, otherwise
            err_code_mgr.ER_CLUSTER_START_FAILED;
            current node info is a ClusterNodeInfo built from the DB row
            whose ip equals this node's inner ip, or None when there is no
            such row (it is always None on failure).
        Others: On an add failure the method returns immediately, so nodes
                that disappeared from the DB are not removed in that pass.
        """
        cur_node = None

        with self.__lock:
            # Open the node database lazily on first use.
            if self.__mit is None:
                try:
                    db_file = os.path.join(self.__app_top_path,
                                           "data", "sqlite", "cluster.db")
                    self.__mit = ClusterMit(db_file)
                except Exception:
                    # Exception (not a bare except) so KeyboardInterrupt
                    # and SystemExit are not swallowed here.
                    tracelog.exception("reload cluster node failed.")
                    return err_code_mgr.ER_CLUSTER_START_FAILED, None

            # IPs currently held in memory; whatever is left in this set
            # after walking the DB rows no longer exists in the DB.
            stale_ips = set(node.get_ip() for node in self.__other_nodes)

            my_ip = self.__cluster_cfg_info.my_inner_ip
            for node in self.__mit.get_all_nodes():
                if node.ip == my_ip:
                    cur_node = ClusterNodeInfo(node.ip)
                    if node.is_enable == 0:
                        cur_node.set_enable(False)
                else:
                    stale_ips.discard(node.ip)
                    ret = self.__add_node(node.ip, node.is_enable)
                    if ret != 0:
                        tracelog.error("add cluster node %s failed." % node.ip)
                        return err_code_mgr.ER_CLUSTER_START_FAILED, None

                if log_all_nodes:
                    tracelog.info("load cluster node: %s" % node.ip)

            # Drop the nodes that are no longer present in the DB.
            for node_ip in stale_ips:
                self.__rmv_node(node_ip)

        return 0, cur_node

    def __when_starting(self):
        """
        Method:    __when_starting
        Description: Handler for the "starting" state: decide whether this
                     node starts as slave or as master.
        Parameter: none
        Return:
        Others: Starts as slave as soon as any enabled node is online;
                starts as master when there is no other enabled node or
                when nobody answered after CLUSTER_JUDGE_STATE_HAERTBEAT
                query rounds.
        """
        has_other_enable_nodes = False
        has_other_online_nodes = False

        # Check whether any node has answered the state query.
        for node in self.__other_nodes:
            if not node.is_enable():
                continue

            has_other_enable_nodes = True

            if node.is_online():
                has_other_online_nodes = True

                if (node.is_role_master()
                        and self.__mater_node_info.get_ip() == ""):
                    self.__mater_node_info.update(node.get_ip(),
                                                  node.get_start_time())

        if has_other_online_nodes:
            # Another node is already running: start as slave.
            self.__start_with_slave()
            return

        if not has_other_enable_nodes:
            # No other usable node: start as master.
            tracelog.info("the current cluster node is the only enabled node.")
            self.__start_with_master(CLUSTER_STATE_ONLY_MASTER)
            return

        # Keep querying until the counter reaches
        # CLUSTER_JUDGE_STATE_HAERTBEAT, then start as master.
        if self.__query_counter < CLUSTER_JUDGE_STATE_HAERTBEAT:
            self.__query_other_node_state()
        else:
            tracelog.info("other cluster nodes didn't respond for state query command.")
            self.__start_with_master(CLUSTER_STATE_ONLY_MASTER)

    def __when_now_master(self):
        """
        Method:    __when_now_master
        Description: Handler for the master role: report online/offline
                     changes of the other nodes, resolve a double-master
                     situation, keep the state (normal / only master) up
                     to date and send the next state query.
        Parameter: none
        Return:
        Others:
        """
        is_any_other_node_online = False

        with self.__lock:
            for node in self.__other_nodes:
                if not node.is_enable():
                    continue

                # Check whether the state of the node has changed.
                state_change = node.fetch_change_flag()
                if state_change == CLUSTER_NODE_STATE_CHANGE_ONLINE:
                    tracelog.info("cluster node %s is online" % node.get_ip())
                    # The node came online: notify the upper layer.
                    self.__cluster_node.on_node_online(node.get_ip())
                elif state_change == CLUSTER_NODE_STATE_CHANGE_OFFLINE:
                    tracelog.info("cluster node %s is offline" % node.get_ip())
                    # The node went offline: notify the upper layer.
                    self.__cluster_node.on_node_offline(node.get_ip())

                if not node.is_online():
                    continue

                is_any_other_node_online = True

                if node.is_role_master():
                    # Two masters: the one with the smaller ip wins.
                    # The ips are compared as strings, which is fine as
                    # long as every node uses the same algorithm.
                    if self.is_node_prior(node):
                        tracelog.info("cluster node %s is also master, this node will goto slave" % node.get_ip())
                        self.__mater_node_info.update(node.get_ip(),
                                                      node.get_start_time())
                        self.__switch_to_slave()
                        return
                    else:
                        tracelog.info("cluster node %s is also master, that node will goto slave" % node.get_ip())
                else:
                    if self.__state == CLUSTER_STATE_ONLY_MASTER:
                        self.__change_state(CLUSTER_STATE_NORMAL)

            if (not is_any_other_node_online
                    and self.__state == CLUSTER_STATE_NORMAL):
                self.__change_state(CLUSTER_STATE_ONLY_MASTER)

            self.__query_other_node_state()

    def __when_now_slave(self):
        """
        Method:    __when_now_slave
        Description: Handler for the slave role: follow the master while
                     it is online, otherwise enter the no-master state.
        Parameter: none
        Return:
        Others: When no enabled master node is online the state becomes
                CLUSTER_STATE_NO_MASTER, the query counter is reset (all
                nodes set offline) and the nodes are reloaded from the DB.
        """
        with self.__lock:
            for node in self.__other_nodes:
                if not node.is_enable():
                    continue

                if node.is_role_master():
                    node.check_heartbeat()
                    if node.is_online():
                        old_master_ip = self.__mater_node_info.get_ip()
                        if self.__mater_node_info.update(node.get_ip(),
                                                         node.get_start_time()):
                            self.__cluster_node.on_master_change(
                                old_master_ip,
                                self.__mater_node_info.get_ip())
                        return

            tracelog.info("the master cluster node is offline.")

            # Switch to the no-master state.
            self.__change_state(CLUSTER_STATE_NO_MASTER)
            self.__reset_query_counter(True)
            self.reload_nodes()

    def __when_now_no_master(self):
        """
        Method:    __when_now_no_master
        Description: Handler for the no-master state: follow a node that
                     became master, wait for a higher priority node, or
                     become master after CLUSTER_JUDGE_STATE_HAERTBEAT
                     query rounds.
        Parameter: none
        Return:
        Others:
        """
        # Is there any other online node?
        has_other_online_node = False

        # Is there an online node with a higher priority?
        has_prior_online_node = False

        self.__query_other_node_state()

        with self.__lock:
            for node in self.__other_nodes:
                if not node.is_enable():
                    continue

                if not node.is_online():
                    continue

                has_other_online_node = True

                if node.is_role_master():
                    tracelog.info("the cluster node %s become to master" % node.get_ip())
                    self.__mater_node_info.update(node.get_ip(),
                                                  node.get_start_time())
                    self.__change_state(CLUSTER_STATE_NORMAL)
                    return

                if self.is_node_prior(node):
                    has_prior_online_node = True

            if has_prior_online_node:
                # Wait for the other node to become master.
                return

            if self.__query_counter >= CLUSTER_JUDGE_STATE_HAERTBEAT:
                tracelog.info("no higher priority cluster node respond for state query command.")
                if has_other_online_node:
                    self.__switch_to_master(CLUSTER_STATE_NORMAL)
                else:
                    self.__switch_to_master(CLUSTER_STATE_ONLY_MASTER)

    def run(self):
        """
        Method:    run
        Description: Thread entry point: run the handler matching the
                     current role/state, then wait up to 2 seconds on the
                     stop event, until the stop event is set.
        Parameter: none
        Return:
        Others: Every 30 iterations the nodes are reloaded from the DB and
                the virtual ip is bound again (master) or unbound again
                (slave). Internal data is cleared when the loop ends.
        """
        self.__reset_query_counter(True)

        reload_counter = 0
        reload_times = 30

        while 1:
            try:
                if self.__role == CLUSTER_ROLE_UNKNOWN:
                    if self.__state == CLUSTER_STATE_STARTING:
                        self.__when_starting()
                elif self.is_master():
                    self.__when_now_master()
                elif self.is_slave():
                    if self.__state == CLUSTER_STATE_NO_MASTER:
                        self.__when_now_no_master()
                    else:
                        self.__when_now_slave()
            except Exception:
                # Keep the maintenance loop alive on handler errors, but
                # do not swallow KeyboardInterrupt / SystemExit.
                tracelog.exception("error occur")

            if self.__stoped_event.wait(2) is True:
                break

            reload_counter += 1
            if reload_counter == reload_times:
                self.reload_nodes()

                if self.is_master():
                    # Periodically try to bind the virtual ip.
                    self.__bind_virtual_ip(False)
                elif self.is_slave():
                    # Periodically try to unbind the virtual ip.
                    self.__unbind_virtual_ip(False)

                reload_counter = 0

        self.__clear()
        tracelog.info("cluster node stoped.")

    def __bind_virtual_ip(self, write_log):
        """
        Method:    __bind_virtual_ip
        Description: Bind the virtual ip address of the cluster.
        Parameter:
            write_log: whether to write a log when binding fails
        Return: error code; 0 when no virtual ip is configured, otherwise
                the code returned by bind_virtual_ip
        Others:
        """
        if self.__cluster_cfg_info.virtual_cluster_ip == "":
            # Nothing to bind. Return 0 (not None): the callers compare
            # the result with 0 and format it with %d.
            return 0

        ret, msg = bind_virtual_ip(self.__cluster_cfg_info.virtual_cluster_ip
                                , self.__cluster_cfg_info.virtual_cluster_mask
                                , self.__cluster_cfg_info.external_NIC)

        if ret != 0 and write_log:
            tracelog.error("bind_virtual_ip(%s/%s on %s) failed:%d, %s" % (
                              self.__cluster_cfg_info.virtual_cluster_ip
                            , self.__cluster_cfg_info.virtual_cluster_mask
                            , self.__cluster_cfg_info.external_NIC
                            , ret
                            , msg))
        return ret

    def __unbind_virtual_ip(self, write_log):
        """
        Method:    __unbind_virtual_ip
        Description: Unbind the virtual ip address of the cluster.
        Parameter:
            write_log: whether to write a log when unbinding fails
        Return: error code; 0 when no virtual ip is configured, otherwise
                the code returned by unbind_virtual_ip
        Others:
        """
        if self.__cluster_cfg_info.virtual_cluster_ip == "":
            # Nothing to unbind. Return 0 (not None): the callers compare
            # the result with 0 and format it with %d.
            return 0

        ret, msg = unbind_virtual_ip(self.__cluster_cfg_info.virtual_cluster_ip
                                , self.__cluster_cfg_info.virtual_cluster_mask
                                , self.__cluster_cfg_info.external_NIC)

        if ret != 0 and write_log:
            tracelog.error("unbind_virtual_ip(%s/%s on %s) failed:%d, %s" % (
                              self.__cluster_cfg_info.virtual_cluster_ip
                            , self.__cluster_cfg_info.virtual_cluster_mask
                            , self.__cluster_cfg_info.external_NIC
                            , ret
                            , msg))
        return ret

    def __start_with_master(self, state):
        """
        Method:    __start_with_master
        Description: Start the current node with the master role.
        Parameter:
            state: the state to start with
        Return:
        Others: Notifies the upper layer through on_start.
        """
        self.__role = CLUSTER_ROLE_MASTER
        self.__state = state
        self.__mater_node_info.update(self.__cluster_cfg_info.my_inner_ip,
                                      self.__start_time)

        tracelog.info("the current cluster node %s start with master, state:%d." % (
                          self.__cluster_cfg_info.my_inner_ip
                        , state))

        self.__cluster_node.on_start(self.__role, state)

        # After entering the master role, reset the state of the others.
        self.__reset_query_counter(True)

    def __start_with_slave(self):
        """
        Method:    __start_with_slave
        Description: Start the current node with the slave role.
        Parameter: none
        Return:
        Others: Resets the master info to empty values and notifies the
                upper layer through on_start.
        """
        self.__role = CLUSTER_ROLE_SLAVE
        self.__state = CLUSTER_STATE_NORMAL
        self.__mater_node_info.update("", "")

        tracelog.info("the current cluster node %s start with slave." % self.__cluster_cfg_info.my_inner_ip)

        self.__cluster_node.on_start(self.__role, self.__state)

    def __switch_to_master(self, state):
        """
        Method:    __switch_to_master
        Description: Switch the role of the current node to master.
        Parameter:
            state: the new state
        Return:
        Others: Binds the virtual ip and notifies the upper layer through
                on_state_change.
        """
        old_role = self.__role
        old_state = self.__state

        self.__mater_node_info.update(self.__cluster_cfg_info.my_inner_ip,
                                      self.__start_time)
        self.__role = CLUSTER_ROLE_MASTER
        self.__state = state

        tracelog.info("the current cluster node %s switch to master. state:%d" % (
                          self.__cluster_cfg_info.my_inner_ip
                        , state))

        ret_code = self.__bind_virtual_ip(True)
        if ret_code != 0:
            tracelog.error("bind virtual ip faild. ret_code:%d" % ret_code)

        self.__cluster_node.on_state_change(old_role, old_state,
                                            self.__role, state)

        # After entering the master role, reset the state of the others.
        self.__reset_query_counter(True)

    def __switch_to_slave(self):
        """
        Method:    __switch_to_slave
        Description: Switch the role of the current node to slave; the
                     state becomes CLUSTER_STATE_NORMAL.
        Parameter: none
        Return:
        Others: Unbinds the virtual ip and notifies the upper layer
                through on_state_change.
        """
        old_role = self.__role
        old_state = self.__state

        # The new state is kept in a local: this method has no `state`
        # parameter, so it must not refer to one.
        new_state = CLUSTER_STATE_NORMAL

        self.__role = CLUSTER_ROLE_SLAVE
        self.__state = new_state

        tracelog.info("the current cluster node %s switch to slave. state:%d" % (
                          self.__cluster_cfg_info.my_inner_ip
                        , new_state))

        ret_code = self.__unbind_virtual_ip(True)
        if ret_code != 0:
            tracelog.error("unbind virtual ip faild. ret_code:%d" % ret_code)

        self.__cluster_node.on_state_change(old_role, old_state,
                                            self.__role, new_state)

    def __change_state(self, state):
        """
        Method:    __change_state
        Description: Change the current state; the role stays unchanged.
        Parameter:
            state: the new state
        Return:
        Others: Notifies the upper layer through on_state_change.
        """
        old_state = self.__state
        self.__state = state

        tracelog.info("the current cluster node %s change state:%d" % (
                          self.__cluster_cfg_info.my_inner_ip
                        , state))

        self.__cluster_node.on_state_change(self.__role, old_state,
                                            self.__role, state)

    def __query_other_node_state(self):
        """
        Method:    __query_other_node_state
        Description: Send a state query to every other node and count the
                     query round.
        Parameter: none
        Return:
        Others:
        """
        req_msg = self.__get_state_msg(cluster_cmd_code.CMD_CLUSTER_QUERY_STATE)

        with self.__lock:
            for node in self.__other_nodes:
                # Send the state query command to the node.
                url = self.__get_url(node.get_ip())
                self.__callacp_client.send(url, req_msg)

                # Update the heartbeat counter of the node.
                node.check_heartbeat()

            self.__query_counter += 1

    def __reset_query_counter(self, set_nodes_to_offline):
        """
        Method:    __reset_query_counter
        Description: Reset the heartbeat query counter and the heartbeat
                     of every other node.
        Parameter:
            set_nodes_to_offline: whether the nodes are also set offline
        Return:
        Others:
        """
        self.__query_counter = 0
        for node in self.__other_nodes:
            node.reset_heartbeat(set_nodes_to_offline)

    def __get_state_msg(self, cmd_code):
        """
        Method:    __get_state_msg
        Description: Build a state message carrying the ip, role and start
                     time of the current node.
        Parameter:
            cmd_code: command code of the message
        Return: a pycallacp.AcpMessage holding the serialized state
        Others:
        """
        state = ClusterStateMsg()
        state.ip = self.__cluster_cfg_info.my_inner_ip
        state.role = self.get_role()
        state.start_time = self.__start_time

        msg = pycallacp.AcpMessage(cmd_code
                                 , state.serialize())
        return msg

    def on_query_state(self, url, msg):
        """
        Method:    on_query_state
        Description: Handler of the "query state" command.
        Parameter:
            url: url of the peer that sent the query
            msg: the query message
        Return:
        Others: When the sender is unknown the nodes are reloaded from the
                DB once and the lookup is repeated. The ack is sent to the
                peer in every case.
        """
        with self.__lock:
            for attempt in (1, 2):
                for node in self.__other_nodes:
                    if node.get_ip() == msg.ip:
                        node.on_heartbeat(msg)
                        break
                else:
                    if attempt == 2:
                        tracelog.error("the cluster node %s is unknown" % msg.ip)
                    else:
                        # Reload the node information from the DB and
                        # look the sender up once more.
                        tracelog.error("receive state query cmd from unknown "
                                       "node:%s now try to reload nodes" % msg.ip)
                        self.reload_nodes()
                    continue

                # The sender was found: no second attempt is needed.
                break

        # Send the ack message.
        ack_msg = self.__get_state_msg(cluster_cmd_code.CMD_CLUSTER_ACK_STATE)
        self.__callacp_srv.send(url, ack_msg)

    def on_ack_state(self, msg):
        """
        Method:    on_ack_state
        Description: Handler of the ack of a state query.
        Parameter:
            msg: the ack message of the state query
        Return:
        Others: An ack from an unknown node is only logged.
        """
        with self.__lock:
            for node in self.__other_nodes:
                if node.get_ip() == msg.ip:
                    node.on_heartbeat(msg)
                    break
            else:
                tracelog.error("receive state ack cmd from unknown node:%s" % msg.ip)