def _validate(self, master_id, validated_inst_id):
    """Run the post-operation checks for a single read replica.

    The ``utils.check_*`` helpers are called in a fixed order: server
    status of the replica, its MySQL admin user, the VIP of the master,
    the VIP of the replica, replication delay, master/replica replication
    consistency and finally the read-replica topology of ``self.group_id``.

    :param master_id: id of the master instance the replica belongs to.
    :param validated_inst_id: id of the read replica being validated.
    """
    replica_id = validated_inst_id

    # The replica must be RUNNING with no pending task within 300 seconds.
    utils.check_server_status(
        replica_id,
        expected_task=utils.tasks.InstanceTasks.NONE,
        type=DBInstanceType.READ_REPLI,
        expected_svr_status=utils.ServiceStatuses.RUNNING,
        deleted=False,
        timeout=300)
    utils.check_mysql_adminuser(replica_id)

    # VIP checks: the master without an explicit vip id, the replica with
    # the vip id looked up for it.
    utils.check_vip(master_id)
    replica_vip_id = utils.get_vip_id(replica_id)
    utils.check_vip(replica_id, vip_id=replica_vip_id)

    utils.check_rpl_delay(replica_id)

    # Resolve the allocated IP of both ends before comparing replication.
    master_server = utils.get_builtin_instance(master_id).server
    master_ip = utils.check_allocate_ip(master_server)
    replica_server = utils.get_builtin_instance(replica_id).server
    replica_ip = utils.check_allocate_ip(replica_server)

    utils.check_rpl_consist(master_id, [replica_id], master_ip, [replica_ip])
    utils.check_rpl_topo_rr(self.group_id)
def validate(self):
    """Validate an HA group after an operation.

    Reads ``self.inst_id`` (master), ``self.dbslave_id`` (standby),
    ``self.group_id`` and ``self.vip_id``; stores the id returned by
    ``utils.check_backup`` in ``self.backup_id``.

    The checks run in this order: server status of master and standby,
    MySQL admin user, MySQL process running, master VIP, backup
    existence/status/path, replication delay of every slave (read
    replicas plus the standby), replication consistency and the HA
    replication topology.
    """
    # Master and standby must both be RUNNING with no pending task.
    for inst_id, inst_type in ((self.inst_id, DBInstanceType.MASTER),
                               (self.dbslave_id, DBInstanceType.STANDBY)):
        utils.check_server_status(
            inst_id,
            expected_task=utils.tasks.InstanceTasks.NONE,
            type=inst_type,
            expected_svr_status=utils.ServiceStatuses.RUNNING,
            deleted=False,
            timeout=600)

    # Slaves to verify: every read replica of the group plus the standby.
    rr_items = InstanceGroupItem.get_by_gid_type(
        utils.get_context(), self.group_id, DBInstanceType.READ_REPLI,
        deleted=False)
    slave_ids = [rr.instance_id for rr in rr_items]
    slave_ids.append(self.dbslave_id)

    utils.check_mysql_adminuser(self.inst_id)
    utils.check_mysql_adminuser(self.dbslave_id)

    # Check each instance in turn. The previous code passed self.inst_id on
    # every iteration, so the standby was never actually checked.
    for _id in (self.inst_id, self.dbslave_id):
        utils.check_mysql_is_running(_id)

    utils.check_vip(self.inst_id, vip_id=self.vip_id)

    self.backup_id = utils.check_backup(self.group_id)
    utils.check_backup_status(self.backup_id)
    utils.check_backup_path(self.backup_id)

    for slave_id in slave_ids:
        utils.check_rpl_delay(slave_id)

    master_inst = utils.get_builtin_instance(self.inst_id)
    slave_inst = utils.get_builtin_instance(self.dbslave_id)
    master_ip = utils.check_allocate_ip(master_inst.server)
    slave_ip = utils.check_allocate_ip(slave_inst.server)
    LOG.info("master_ip:%s slave_ip:%s" % (master_ip, slave_ip))

    # NOTE(review): slave_ids may contain read replicas as well, while only
    # the standby IP is passed -- confirm this matches what
    # check_rpl_consist expects.
    utils.check_rpl_consist(self.inst_id, slave_ids, master_ip, [slave_ip])
    utils.check_rpl_topo_ha(self.group_id)
def _failover(self, stop_mysqld=False, rm_mysql_data=False):
    """Break a read replica, trigger a failover through RPC and validate.

    Sets ``self.inst_id`` (from ``self._get_rid()``), ``self.vip_id``,
    ``self.group_id`` and, after the failover, ``self.master_id``.

    :param stop_mysqld: when true, call ``utils.stop_mysqld`` on the
        replica's allocated IP before the failover.
    :param rm_mysql_data: when true, call ``utils.mysql_data_lost`` on the
        replica's allocated IP before the failover.
    """
    def _wait_for_status(svr_status, timeout):
        # Every status check in this method targets the same read replica.
        utils.check_server_status(
            self.inst_id,
            expected_task=utils.tasks.InstanceTasks.NONE,
            type=DBInstanceType.READ_REPLI,
            expected_svr_status=svr_status,
            deleted=False,
            timeout=timeout)

    self.inst_id = self._get_rid()
    _wait_for_status(utils.ServiceStatuses.RUNNING, 10)

    self.vip_id = utils.get_vip_id(self.inst_id)
    rr_instance = utils.get_builtin_instance(self.inst_id)
    nova_instance = rr_instance.server
    self.group_id = rr_instance.db_info.group_id

    # Create marker databases so that data can be verified on the replica
    # after the failover.
    # NOTE(review): self.master_id is read here but only assigned further
    # down in this method -- it must already be set by the caller.
    ran_count = 56
    utils.generate_databases(self.master_id, count=ran_count)

    ip = utils.check_allocate_ip(nova_instance)
    if stop_mysqld:
        utils.stop_mysqld(ip)
    if rm_mysql_data:
        utils.mysql_data_lost(ip)
    # NOTE(review): this waits for SHUTDOWN regardless of the flags; with
    # both flags false nothing above stops the service -- confirm callers
    # always set at least one of them.
    _wait_for_status(utils.ServiceStatuses.SHUTDOWN, 120)

    rpc.call(utils.get_context(), "taskmanager",
             {"method": "failover",
              "args": {'instance_id': self.inst_id}})

    _wait_for_status(utils.ServiceStatuses.RUNNING, 120)
    self.master_id = utils.get_instance_id(self.group_id,
                                           DBInstanceType.MASTER)
    utils.check_generated_databases(self.inst_id, count=ran_count)
    self.validate()
def __trigger_mysqld_crash(cls, inst_id):
    """Stop mysqld on the server backing ``inst_id``.

    Resolves the instance's allocated IP and calls
    ``test_utils.stop_mysqld`` on it with ``stop_ga=True``.

    :param inst_id: id of the instance whose mysqld should be stopped.
    """
    server = test_utils.get_builtin_instance(inst_id).server
    target_ip = test_utils.check_allocate_ip(server)
    test_utils.stop_mysqld(target_ip, stop_ga=True)
def _failover_test(self, group_id, trigger_inst_id, do_workload=False,
                   do_prepare=False, mysqld_killed=False,
                   host_rebooted=False, remove_tmp_initsql=False,
                   mysql_data_lost=False, check_vip=False,
                   check_rpl_consist=True, check_binlog_range=False):
    """Inject a fault on an instance, trigger a failover and verify it.

    :param group_id: id of the instance group under test.
    :param trigger_inst_id: id of the instance the failover is triggered on.
    :param do_workload: run the workload first (only when the trigger
        instance is a master).
    :param do_prepare: forwarded to the workload runner as ``do_prepare``.
    :param mysqld_killed: stop mysqld on the instance and wait until its
        service status is SHUTDOWN.
    :param host_rebooted: trigger a host reboot for the instance.
    :param remove_tmp_initsql: remove the temporary init SQL file on the VM.
    :param mysql_data_lost: trigger loss of the MySQL data on the instance.
    :param check_vip: currently unused by this method.
    :param check_rpl_consist: verify the group size is unchanged and that
        every member is RUNNING after the failover.
    :param check_binlog_range: verify that the end of the restorable time
        range keeps advancing after the failover.
    :raises AssertionError: when one of the post-failover checks fails.
    """
    LOG.info("Doing Failover Test, group_id:%s, instance_id:%s, do_workload:%s, do_prepare:%s." %
             (group_id, trigger_inst_id, do_workload, do_prepare))

    # Snapshot of the group and of the trigger instance before the failover.
    before_group_items = InstanceGroupItem.list_by_gid(
        test_utils.get_context(), group_id, deleted=False)
    before_items = set(map(lambda x: x.type + "_" + x.instance_id,
                           before_group_items))
    before_instance = test_utils.get_builtin_instance(trigger_inst_id)
    before_rip = test_utils.check_allocate_ip(before_instance.server)
    before_origin_instid = before_instance.id

    rt_before = None
    if check_binlog_range:
        rt_before = test_utils.get_restorable_time(trigger_inst_id)

    # Optional workload and fault injection.
    if do_workload and before_instance.type == DBInstanceType.MASTER:
        FAILOVERInstance.__run_workload(do_prepare=do_prepare)

    if remove_tmp_initsql:
        FAILOVERInstance.__trigger_vm_remove_tmp_sql_file(trigger_inst_id)

    if mysqld_killed:
        FAILOVERInstance.__trigger_mysqld_crash(trigger_inst_id)
        test_utils.check_server_status(
            trigger_inst_id,
            expected_task=tasks.InstanceTasks.NONE,
            type=before_instance.type,
            expected_svr_status=test_utils.ServiceStatuses.SHUTDOWN,
            deleted=False,
            timeout=120)

    if host_rebooted:
        FAILOVERInstance.__trigger_host_reboot(trigger_inst_id)
        # When the host machine is rebooted no guest agent updates the
        # service status, so the SHUTDOWN status check is intentionally
        # skipped here.

    if mysql_data_lost:
        FAILOVERInstance.__trigger_mysql_data_lost(trigger_inst_id)

    rpc.call(test_utils.get_context(), "taskmanager",
             {"method": "failover",
              "args": {'instance_id': before_origin_instid}},
             timeout=3600)

    ## check vip <--> rip mapping.
    ## vip should be changed in 10 seconds.
    if before_instance.type in (DBInstanceType.MASTER,
                                DBInstanceType.READ_REPLI):
        after_instance = test_utils.get_builtin_instance(trigger_inst_id)
        after_nova_inst = after_instance.server
        after_rip = test_utils.check_allocate_ip(after_nova_inst)
        # Two separate asserts so a failure says which condition broke:
        # the vip must be kept, the real ip behind it must have changed.
        assert after_instance.vip == before_instance.vip, \
            (after_instance.vip, before_instance.vip)
        assert before_rip != after_rip, (before_rip, after_rip)

    if before_instance.type == DBInstanceType.MASTER:
        test_utils.check_server_status(
            before_instance.id,
            expected_task=tasks.InstanceTasks.NONE,
            type=DBInstanceType.MASTER,
            expected_svr_status=test_utils.ServiceStatuses.RUNNING,
            deleted=False,
            timeout=120)

    ## check replication topo
    after_group_items = InstanceGroupItem.list_by_gid(
        test_utils.get_context(), group_id, deleted=False)
    after_items = set(map(lambda x: x.type + "_" + x.instance_id,
                          after_group_items))
    # The before/after item sets are only logged; only the group size is
    # asserted below.
    LOG.info("before " + str(before_items))
    LOG.info("after " + str(after_items))

    if check_rpl_consist:
        assert len(before_group_items) == len(after_group_items), "size of mysql cluster should be the same."
        for group_item in after_group_items:
            if (group_item.type == DBInstanceType.STANDBY and
                    group_item.instance_id == before_instance.id):
                # The instance that triggered the failover now appears as
                # the standby: only require its group item to exist, its
                # server status is not checked.
                item = InstanceGroupItem.get_by_instance_id(
                    test_utils.get_context(), group_item.instance_id,
                    deleted=False)
                assert item is not None
                continue
            test_utils.check_server_status(
                group_item.instance_id,
                expected_task=tasks.InstanceTasks.NONE,
                type=group_item.type,
                expected_svr_status=test_utils.ServiceStatuses.RUNNING,
                deleted=False,
                timeout=120)

    if check_binlog_range:
        # The end of the restorable range must move past its pre-failover
        # value and must still be advancing one minute later.
        rt_after = test_utils.get_restorable_time(trigger_inst_id)
        assert rt_after.end > rt_before.end, (rt_after.end, rt_before.end)
        time.sleep(60)
        rt_after2 = test_utils.get_restorable_time(trigger_inst_id)
        assert rt_after2.end > rt_after.end, (rt_after2.end, rt_after.end)
def __trigger_mysql_data_lost(cls, inst_id):
    """Trigger MySQL data loss on the server backing ``inst_id``.

    Resolves the instance's allocated IP and hands it to
    ``test_utils.mysql_data_lost``.

    :param inst_id: id of the instance to damage.
    """
    server = test_utils.get_builtin_instance(inst_id).server
    target_ip = test_utils.check_allocate_ip(server)
    test_utils.mysql_data_lost(target_ip)
def __trigger_vm_remove_tmp_sql_file(cls, inst_id):
    """Remove the temporary init SQL file on the server backing ``inst_id``.

    Resolves the instance's allocated IP and hands it to
    ``test_utils.remove_tmp_initsql``.

    :param inst_id: id of the instance whose file should be removed.
    """
    server = test_utils.get_builtin_instance(inst_id).server
    target_ip = test_utils.check_allocate_ip(server)
    test_utils.remove_tmp_initsql(target_ip)
def __trigger_vm_oom(cls, inst_id):
    """Call ``test_utils.reboot_host`` for the server backing ``inst_id``.

    NOTE(review): despite the "oom" in the name, the only action taken is
    ``test_utils.reboot_host`` on the instance's allocated IP -- confirm
    whether this helper is misnamed.

    :param inst_id: id of the instance whose host should be rebooted.
    """
    server = test_utils.get_builtin_instance(inst_id).server
    target_ip = test_utils.check_allocate_ip(server)
    test_utils.reboot_host(target_ip)
def failover(self, instance_id=None, _strategy=None):
    """Stop mysqld on one HA member, trigger a failover and validate.

    Which member is stopped is decided by ``CONF.ha_failover_strategy``
    (``'master'`` or ``'standby'``). Updates ``self.inst_id``,
    ``self.group_id``, ``self.dbslave_id`` and ``self.vip_id``.

    :param instance_id: id of the master instance; when falsy, the id is
        looked up with ``utils.get_instance_id_bytenant``.
    :param _strategy: accepted but not used; the strategy always comes
        from ``CONF.ha_failover_strategy``.
    :raises Exception: when the configured strategy is neither
        ``'master'`` nor ``'standby'``.
    """
    def _wait_for_status(inst_id, inst_type, svr_status):
        # All status checks in this method share task, deleted and timeout.
        utils.check_server_status(
            inst_id,
            expected_task=utils.tasks.InstanceTasks.NONE,
            type=inst_type,
            expected_svr_status=svr_status,
            deleted=False,
            timeout=120)

    running = utils.ServiceStatuses.RUNNING

    self.inst_id = instance_id
    if not self.inst_id:
        self.inst_id = utils.get_instance_id_bytenant(DBInstanceType.MASTER)

    self.group_id = utils.get_builtin_instance(self.inst_id).group_id
    self.dbslave_id = utils.get_instance_id(self.group_id,
                                            DBInstanceType.STANDBY)

    _wait_for_status(self.inst_id, DBInstanceType.MASTER, running)
    _wait_for_status(self.dbslave_id, DBInstanceType.STANDBY, running)
    self.vip_id = utils.get_vip_id(self.inst_id)

    # Pick the member to break according to the configured strategy.
    strategy = CONF.ha_failover_strategy
    if strategy == 'master':
        target_id, target_type = self.inst_id, DBInstanceType.MASTER
    elif strategy == 'standby':
        target_id, target_type = self.dbslave_id, DBInstanceType.STANDBY
    else:
        raise Exception("not found ha_failover_strategy %s" % strategy)

    target = utils.get_builtin_instance(target_id)
    nova_instance = target.server
    failed_instance_id = target.id
    # The virtual instance id is only needed to find the master's new
    # origin instance id after the failover.
    virtual_instance_id = None
    if strategy == 'master':
        virtual_instance_id = target.virtual_instance_id

    # Marker databases are created on the master so the data can be
    # verified after the failover.
    rancount = random.randint(50, 100)
    utils.generate_databases(self.inst_id, count=rancount)

    ip = utils.check_allocate_ip(nova_instance)
    utils.stop_mysqld(ip, stop_ga=True)
    _wait_for_status(failed_instance_id, target_type,
                     utils.ServiceStatuses.SHUTDOWN)

    rpc.call(utils.get_context(), "taskmanager",
             {"method": "failover",
              "args": {'instance_id': failed_instance_id}})

    if strategy == 'master':
        # Resolve the new origin instance behind the master's virtual
        # instance id and track it as the master from now on.
        origin_inst_id = inst_utils.virtual_instid_2_origin_instid(
            virtual_instance_id)
        self.inst_id = origin_inst_id
        _wait_for_status(origin_inst_id, DBInstanceType.MASTER, running)

    new_slave_id = utils.get_instance_id(self.group_id,
                                         DBInstanceType.STANDBY)
    _wait_for_status(new_slave_id, DBInstanceType.STANDBY, running)
    self.dbslave_id = new_slave_id

    utils.check_generated_databases(self.inst_id, count=rancount)
    utils.clear_generated_databases(self.dbslave_id, count=rancount)
    self.validate()