def recoverServiceFail(self, cluster_id, fail_node_name):
        """Handle a service failure on a node: restart first, recover on failure.

        Looks up the cluster and node, asks a ``Detector`` which services
        failed, and tries to restart them. If the restart reports a falsy
        status, the VMs are recovered via ``recoverVM`` and the node is
        rebooted via ``recoverNodeByReboot``.

        :param cluster_id: identifier passed to ``ClusterManager.getCluster``.
        :param fail_node_name: name passed to ``cluster.getNodeByName``.
        :return: ``None`` if the cluster is not found; ``True`` if the detector
            returns ``None`` for the failed services; the restart status when
            it is truthy; otherwise the result of ``recoverNodeByReboot``.
        """
        cluster = ClusterManager.getCluster(cluster_id)
        if not cluster:
            logging.error("RecoverManager : cluster not found")
            return
        # NOTE(review): the lookup result is used unchecked below -- confirm
        # getNodeByName cannot return None for an unknown node name.
        fail_node = cluster.getNodeByName(fail_node_name)

        port = int(self.config.get("detection", "polling_port"))
        version = int(self.config.get("version", "version"))
        fail_services = Detector(fail_node, port).getFailServices()

        # Without this guard the membership test below raises TypeError on
        # None; treat "nothing reported" as nothing to recover.
        if fail_services is None:
            logging.info(
                "get fail service equals to None, abort recover service fail")
            return True

        if "agents" in fail_services:
            status = self.restartDetectionService(fail_node, version)
        else:
            status = self.restartServices(fail_node, fail_services, version)

        if status:
            return status  # restart service success

        # Restart failed: recover the VMs, then reboot the node.
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("start recovery")
        print("fail node is %s" % fail_node.name)
        print("start recovery vm")
        self.recoverVM(cluster, fail_node)
        print("end recovery vm")
        return self.recoverNodeByReboot(fail_node)
示例#2
0
    def recoverServiceFail(self, cluster_id, fail_node_name):
        """
        Handle a service failure on a node, printing timestamps for each phase.

        The failed services are restarted first; if the restart reports a
        falsy status, the VMs are evacuated via ``recoverVMByEvacuate`` and
        the node is rebooted via ``recoverNodeByReboot``.

        :param cluster_id: identifier passed to ``ClusterManager.getCluster``.
        :param fail_node_name: name passed to ``cluster.getNodeByName``.
        :return: ``None`` if the cluster is not found; ``True`` if the detector
            returns ``None`` for the failed services; the restart status when
            it is truthy; otherwise the result of ``recoverNodeByReboot``.
        """
        detected_at = time.time()
        print("detect service fail", detected_at)

        cluster = ClusterManager.getCluster(cluster_id)
        if not cluster:
            logging.error("RecoverManager : cluster not found")
            return

        node = cluster.getNodeByName(fail_node_name)
        polling_port = int(self.config.get("detection", "polling_port"))
        version = int(self.config.get("version", "version"))
        broken = Detector(node, polling_port).getFailServices()
        if broken is None:
            logging.info(
                "get fail service equals to None, abort recover service fail")
            return True

        if "agents" not in broken:
            restart_began = time.time()
            print("start restart service:", restart_began)
            status = self.restartServices(node, broken, version)
        else:
            status = self.restartDetectionService(node, version)

        if status:
            # Restart succeeded: report the finish time and pass the status on.
            restart_done = time.time()
            print("finish restart service:", restart_done)
            return status

        # Restart failed: evacuate the VMs, then reboot the node.
        print("start recovery service fail")
        print("fail node is %s" % node.name)
        evacuation_began = time.time()
        print("start recovery vm", evacuation_began)
        self.recoverVMByEvacuate(cluster, node)
        evacuation_done = time.time()
        print("end recovery vm", evacuation_done)
        return self.recoverNodeByReboot(node)