def recoverServiceFail(self, cluster_id, fail_node_name):
    """Restart the failed services on a node, falling back to node recovery.

    The cluster and node are looked up, a ``Detector`` is asked which
    services are failing, and a restart is attempted. When the restart
    does not succeed, the node's VMs are recovered through
    ``self.recoverVM`` and the node itself through
    ``self.recoverNodeByReboot``.

    :param cluster_id: identifier handed to ``ClusterManager.getCluster``.
    :param fail_node_name: node name handed to ``cluster.getNodeByName``.
    :return: ``None`` when the cluster is not found; ``True`` when the
        detector returns ``None`` instead of a collection of services;
        the (truthy) restart status when the restart succeeds; otherwise
        whatever ``self.recoverNodeByReboot`` returns.
    """
    cluster = ClusterManager.getCluster(cluster_id)
    if not cluster:
        logging.error("RecoverManager : cluster not found")
        return None

    fail_node = cluster.getNodeByName(fail_node_name)
    port = int(self.config.get("detection", "polling_port"))
    version = int(self.config.get("version", "version"))

    detector = Detector(fail_node, port)
    fail_services = detector.getFailServices()
    if fail_services is None:
        # Without this guard the membership test below raises TypeError.
        # NOTE(review): returning True mirrors the newer variant of this
        # method, which aborts the same way -- confirm that is intended.
        logging.info(
            "get fail service equals to None, abort recover service fail")
        return True

    if "agents" in fail_services:
        status = self.restartDetectionService(fail_node, version)
    else:
        status = self.restartServices(fail_node, fail_services, version)

    if status:
        # Restarting the service(s) succeeded; nothing more to recover.
        return status

    # Restart failed: recover the VMs first, then the node itself.
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print("start recovery")
    print("fail node is %s" % fail_node.name)
    print("start recovery vm")
    self.recoverVM(cluster, fail_node)
    print("end recovery vm")
    return self.recoverNodeByReboot(fail_node)
def recoverServiceFail(self, cluster_id, fail_node_name):
    """Handle a detected service failure on one node of a cluster.

    Prints wall-clock timestamps around each phase. A restart of the
    failed services is attempted first; if that does not succeed, the VMs
    are moved with ``self.recoverVMByEvacuate`` and the node is recovered
    with ``self.recoverNodeByReboot``.

    :param cluster_id: identifier handed to ``ClusterManager.getCluster``.
    :param fail_node_name: node name handed to ``cluster.getNodeByName``.
    :return: ``None`` when the cluster is not found; ``True`` when the
        detector returns ``None``; the restart status when it is truthy;
        otherwise whatever ``self.recoverNodeByReboot`` returns.
    """
    detected_at = time.time()
    print("detect service fail", detected_at)

    cluster = ClusterManager.getCluster(cluster_id)
    if not cluster:
        logging.error("RecoverManager : cluster not found")
        return None

    node = cluster.getNodeByName(fail_node_name)
    polling_port = int(self.config.get("detection", "polling_port"))
    version = int(self.config.get("version", "version"))

    failed = Detector(node, polling_port).getFailServices()
    if failed is None:
        logging.info(
            "get fail service equals to None, abort recover service fail")
        return True

    if "agents" in failed:
        restarted = self.restartDetectionService(node, version)
    else:
        restart_began = time.time()
        print("start restart service:", restart_began)
        restarted = self.restartServices(node, failed, version)

    if restarted:
        # Restart succeeded: report the time and hand the status back.
        restart_ended = time.time()
        print("finish restart service:", restart_ended)
        return restarted

    # Restart failed: evacuate the VMs, then recover the node itself.
    print("start recovery service fail")
    print("fail node is %s" % node.name)
    evacuate_began = time.time()
    print("start recovery vm", evacuate_began)
    self.recoverVMByEvacuate(cluster, node)
    evacuate_ended = time.time()
    print("end recovery vm", evacuate_ended)
    return self.recoverNodeByReboot(node)