def __check_config_is_right(self):
    """Verify the configuration just created matches what was requested.

    Reloads the configuration from the database and checks its name,
    description, config_type, override values and datastore version
    against the values held on this object.

    :raises Exception: on any mismatch or when the record is missing.
    """
    config = KSC_Configuration.load(utils.get_context(), self.config.id)
    if not config:
        raise Exception("create config, not write in db")
    if self.name != config.name or self.description != config.description:
        raise Exception("create config, name or description bad value")
    if self.config.config_type != '1':
        raise Exception("create config, config_type bad value")
    overrides = KSC_Configuration.get_configuration_overrides(utils.get_context(), self.config.id)
    diffs = {}
    for k, v in self.values.iteritems():
        # BUG FIX: the original indexed overrides[k] (KeyError when the key
        # was never written) and called float() unconditionally (ValueError
        # on non-numeric values); either aborted with an unrelated traceback
        # instead of the intended validation error.  Record them as diffs.
        if k not in overrides:
            diffs[k] = (v, None)
            continue
        stored = overrides[k]
        if stored != v:
            # Stored values may come back as strings; fall back to a
            # numeric comparison when both sides parse as floats.
            try:
                if float(stored) == float(v):
                    continue
            except (TypeError, ValueError):
                pass
            diffs[k] = (v, stored)
    if diffs:
        raise Exception("create config, values error")
    datastore_version = DatastoreVersion.load_by_uuid(self.config.datastore_version_id)
    if not datastore_version:
        raise Exception("create config, bad value of datastore_version_id")
    if datastore_version.name != self.datastore_version:
        raise Exception("create config, bad value of datastore_version_id")
def resize_ha(self, instance_id=None):
    """Resize (scale up) an HA master/standby pair by one flavor step.

    :param instance_id: master instance id to resize; when None, one is
        looked up for the current tenant.
    :raises Exception: when the group has no MASTER item, the current
        flavor is not resizable, or the post-resize topology is wrong.
    """
    import time

    self.inst_id = instance_id
    if not self.inst_id:
        self.inst_id = utils.get_instance_id_bytenant(DBInstanceType.MASTER)
    _inst = utils.get_instance(self.inst_id, deleted=0)
    self.group_id = _inst.group_id
    master = InstanceGroupItem.get_by_gid_type(utils.get_context(), self.group_id, DBInstanceType.MASTER)
    standby = InstanceGroupItem.get_by_gid_type(utils.get_context(), self.group_id, DBInstanceType.STANDBY)
    # BUG FIX: the original assigned these ids only inside "if master:" /
    # "if standby:" and then used them unconditionally, raising NameError
    # when a group item was missing.  Fail fast with a clear error instead.
    if not master:
        raise Exception("resize for ha failed, no MASTER item in group %s" % self.group_id)
    old_master_inst_id = master.instance_id
    old_standy_inst_id = standby.instance_id if standby else None
    old_master_inst = utils.get_builtin_instance(old_master_inst_id)
    old_flavor_id = old_master_inst.flavor_id
    old_virtual_instance_id = old_master_inst.virtual_instance_id
    self.inst_id = old_master_inst.id
    if old_flavor_id not in ['1', '2', '3', '4']:
        raise Exception("It is not support to do resizing based on flavor id: %s, supported flavor_ids should be in (1,2,3,4)" % (old_flavor_id))
    # Resize is "one step up": new flavor id is old id + 1.
    flavor = str(int(old_flavor_id) + 1)
    LOG.info("old flavor : %s, new flavor : %s" % (old_flavor_id, flavor))
    utils.resize_flavor_byclient(self.inst_id, flavor)
    inst_ids = utils.check_resize_status(self.group_id)
    if len(inst_ids) >= 2:
        utils.check_rpl_delay(inst_ids[0])
        utils.check_rpl_delay(inst_ids[1])
        # Give replication a moment to settle before inspecting topology.
        time.sleep(60)
        new_master = utils.get_builtin_instance(old_virtual_instance_id)
        new_standby = InstanceGroupItem.get_by_gid_type(utils.get_context(), new_master.group_id, DBInstanceType.STANDBY)
        # After a successful HA resize the virtual id is preserved but the
        # origin instance id must have changed (new VM behind the same vip).
        if new_master.virtual_instance_id == old_virtual_instance_id and \
                new_master.id != old_master_inst.id:
            self.inst_id = new_master.id
            self.dbslave_id = new_standby.instance_id
            self.vip_id = utils.get_vip_id(self.inst_id)
        else:
            raise Exception("resize for ha failed,new_master.virtual_instance_id %s,"
                            " old_virtual_instance_id %s, new_master.id %s,"
                            " old_master_inst.id %s, new_standby_id %s,"
                            " old_standy_inst_id %s" % (
                                new_master.virtual_instance_id, old_virtual_instance_id,
                                new_master.id, old_master_inst.id, new_standby.instance_id,
                                old_standy_inst_id))
        self.validate()
    else:
        raise Exception("the num of instance_id should be equal or greater than two after resize HA ")
def update(self):
    """Update name, description and override values of the configuration
    through the client API, then verify the stored record changed."""
    self.create()
    cfg_id = self.config.id
    new_values = {
        "myisam_sort_buffer_size": 115343360,
        "join_buffer_size": 307200,
        "sort_buffer_size": 327680,
    }
    utils.configuration_update_byclient(
        cfg_id,
        json.dumps(new_values),
        'update_%s' % self.config.name,
        'update_%s' % self.config.description)
    reloaded = KSC_Configuration.load(utils.get_context(), cfg_id)
    if reloaded == self.config:
        raise Exception("update config, something is not update, config = %s" % reloaded)
    stored = KSC_Configuration.get_configuration_overrides(utils.get_context(), cfg_id)
    # NOTE(review): raising when the stored overrides EQUAL the requested
    # values looks inverted; kept as-is to preserve behavior -- confirm.
    if new_values == stored:
        raise Exception("update config, something is not update, config override = %s" % stored)
def migrate(self, instance_id=None):
    """Migrate a SINGLE instance to a new nova server and verify that the
    compute instance changed while generated databases survived.

    :param instance_id: instance to migrate; looked up by tenant when None.
    """
    self.inst_id = instance_id or utils.get_instance_id_bytenant(DBInstanceType.SINGLE)
    utils.check_server_status(self.inst_id,
                              expected_task=utils.tasks.InstanceTasks.NONE,
                              type=DBInstanceType.SINGLE,
                              expected_svr_status=utils.ServiceStatuses.RUNNING,
                              deleted=False)
    db_info = utils.get_instance(self.inst_id, deleted=False)
    server_id_before = db_info.compute_instance_id
    db_count = 45
    # Seed databases so we can verify data survives the migration.
    utils.generate_databases(self.inst_id, count=db_count)
    rpc.call(utils.get_context(), "taskmanager",
             {"method": "migrate", "args": {'migrate_id': db_info.id}})
    utils.check_server_status(self.inst_id,
                              expected_task=utils.tasks.InstanceTasks.NONE,
                              type=DBInstanceType.SINGLE,
                              expected_svr_status=utils.ServiceStatuses.RUNNING,
                              deleted=False)
    server_id_after = utils.get_instance(self.inst_id, deleted=False).compute_instance_id
    assert server_id_before != server_id_after
    utils.check_generated_databases(self.inst_id, count=db_count)
    self.validate()
def patch(self):
    """Patch two override values via the client API and verify that each
    patched key is actually reflected in the stored overrides.

    :raises Exception: when a patched key does not come back with the
        requested value.
    """
    if not self.config:
        self.create()
    configuration_id = self.config.id
    values = {"myisam_sort_buffer_size": 116391936, "join_buffer_size": 409600}
    utils.configuration_patch_byclient(configuration_id, json.dumps(values))
    override = KSC_Configuration.get_configuration_overrides(utils.get_context(), configuration_id)
    # BUG FIX: the original compared the JSON *string* against the override
    # dict, which can never be equal, so the check was dead.  Verify each
    # patched key individually (same approach as patch_new), tolerating
    # string-typed values coming back from the store.
    for k, v in values.iteritems():
        stored = override.get(k)
        if stored != v and str(stored) != str(v):
            raise Exception("patch config, something is not update, override = %s" % override)
def migrate_when_backup_fail(self):
    """Force the most recent backup into FAILED state, then run migrate()
    to check migration still succeeds with a failed prior backup."""
    self.inst_id = self._get_rid()
    utils.check_server_status(self.inst_id, type=DBInstanceType.READ_REPLI)
    new_backup = utils.create_backup_byclient(self.inst_id)
    utils.check_backup_status(new_backup.id)
    # Flip the backup record to FAILED directly in the DB to simulate a
    # failed backup before migrating.
    record = DBBackup.find_by(utils.get_context(), id=new_backup.id)
    record.state = 'FAILED'
    record.save()
    self.migrate()
def delete(self):
    """Delete the configuration via the client API and verify that neither
    the configuration nor its overrides can be fetched afterwards.

    :raises Exception: when either lookup still returns data.
    """
    self.create()
    config_id = self.config.id
    utils.configuration_delete_byclient(config_id)
    # BUG FIX: the original counted only *exceptions* from the two lookups.
    # KSC_Configuration.load() can also return a falsy value for a missing
    # config (see __check_config_is_right), which made a successful delete
    # look like a failure.  Treat "raises" and "returns nothing" both as
    # deleted.  This also avoids rebinding `config` inside the try block.
    config_gone = False
    override_gone = False
    try:
        if not KSC_Configuration.load(utils.get_context(), config_id):
            config_gone = True
    except Exception:
        config_gone = True
    try:
        if not KSC_Configuration.get_configuration_overrides(utils.get_context(), config_id):
            override_gone = True
    except Exception:
        override_gone = True
    if not (config_gone and override_gone):
        raise Exception("delete config, no delete success..")
def patch_new(self):
    """Patch override values with non-numeric strings and verify every
    patched key comes back from the store with the requested value."""
    if not self.config:
        self.create_new()
    cfg_id = self.config.id
    patched = {"myisam_sort_buffer_size": 'xxx', "join_buffer_size": 'xxx'}
    utils.configuration_patch_byclient(cfg_id, json.dumps(patched))
    stored = KSC_Configuration.get_configuration_overrides(utils.get_context(), cfg_id)
    mismatched = [k for k, v in patched.iteritems() if stored.get(k) != v]
    if mismatched:
        raise Exception("patch config, something is not update, override = %s" % stored)
def migrate(self):
    """Migrate a read-replica, verify a new origin instance appears behind
    the same virtual id, then validate replication with the master."""
    rr_inst_id = self._get_rid()
    before = utils.get_builtin_instance(rr_inst_id)
    prev_origin_id = before.id
    virtual_id = before.virtual_instance_id
    rpc.call(utils.get_context(), "taskmanager",
             {"method": "migrate", "args": {'migrate_id': before.id}})
    utils.check_server_status(virtual_id, DBInstanceType.READ_REPLI,
                              InstanceTasks.NONE, utils.ServiceStatuses.RUNNING,
                              timeout=600)
    after = utils.get_builtin_instance(virtual_id)
    # Same virtual id, different origin id => migration produced a new VM.
    assert after.virtual_instance_id == virtual_id and after.id != prev_origin_id, \
        (virtual_id, after.id)
    master_id = utils.get_instance_id_bytenant(DBInstanceType.MASTER)
    self._validate(master_id, virtual_id)
def force_migrate(self):
    """Migrate a SINGLE instance onto a specific host and log an error if
    nova did not actually place it there."""
    target_host = "rds_zone_1:rds-control-18-220.ksc.com"
    self.inst_id = utils.get_instance_id_bytenant(DBInstanceType.SINGLE)
    result = rpc.call(utils.get_context(), "taskmanager",
                      {"method": "migrate",
                       "args": {'migrate_id': self.inst_id, 'host': target_host}})
    self.inst_id = result['id']
    self.validate()
    nova_id = utils.get_instance(self.inst_id).compute_instance_id
    info = utils.get_nova_server_info(nova_id)
    actual_host = info['OS-EXT-SRV-ATTR:hypervisor_hostname']
    # Placement miss is reported, not fatal (migration itself succeeded).
    if actual_host != target_host:
        LOG.error("expected host is %s, finally host is %s" % (target_host, actual_host))
def _failover(self, stop_mysqld=False, rm_mysql_data=False):
    """Trigger a read-replica failover after simulating a failure.

    :param stop_mysqld: stop mysqld on the replica before failing over.
    :param rm_mysql_data: wipe the replica's mysql data before failing over.
    """
    self.inst_id = self._get_rid()
    utils.check_server_status(self.inst_id,
                              expected_task=utils.tasks.InstanceTasks.NONE,
                              type=DBInstanceType.READ_REPLI,
                              expected_svr_status=utils.ServiceStatuses.RUNNING,
                              deleted=False, timeout=10)
    self.vip_id = utils.get_vip_id(self.inst_id)
    replica = utils.get_builtin_instance(self.inst_id)
    self.group_id = replica.db_info.group_id
    target_id = self.inst_id
    db_count = 56
    # Write test databases through the master so we can later verify the
    # recovered replica still serves them.
    utils.generate_databases(self.master_id, count=db_count)
    replica_ip = utils.check_allocate_ip(replica.server)
    if stop_mysqld:
        utils.stop_mysqld(replica_ip)
    if rm_mysql_data:
        utils.mysql_data_lost(replica_ip)
    # Wait for the injected failure to be observed as SHUTDOWN.
    utils.check_server_status(self.inst_id,
                              expected_task=utils.tasks.InstanceTasks.NONE,
                              type=DBInstanceType.READ_REPLI,
                              expected_svr_status=utils.ServiceStatuses.SHUTDOWN,
                              deleted=False, timeout=120)
    rpc.call(utils.get_context(), "taskmanager",
             {"method": "failover", "args": {'instance_id': target_id}})
    utils.check_server_status(self.inst_id,
                              expected_task=utils.tasks.InstanceTasks.NONE,
                              type=DBInstanceType.READ_REPLI,
                              expected_svr_status=utils.ServiceStatuses.RUNNING,
                              deleted=False, timeout=120)
    self.master_id = utils.get_instance_id(self.group_id, DBInstanceType.MASTER)
    utils.check_generated_databases(self.inst_id, count=db_count)
    self.validate()
def validate(self):
    """Validate a healthy HA topology: master and standby RUNNING, admin
    users present, mysqld up on both nodes, vip attached, a backup present
    and replication delay/consistency/topology all OK."""
    utils.check_server_status(self.inst_id,
                              expected_task=utils.tasks.InstanceTasks.NONE,
                              type=DBInstanceType.MASTER,
                              expected_svr_status=utils.ServiceStatuses.RUNNING,
                              deleted=False, timeout=600)
    utils.check_server_status(self.dbslave_id,
                              expected_task=utils.tasks.InstanceTasks.NONE,
                              type=DBInstanceType.STANDBY,
                              expected_svr_status=utils.ServiceStatuses.RUNNING,
                              deleted=False, timeout=600)
    rr_items = InstanceGroupItem.get_by_gid_type(utils.get_context(), self.group_id,
                                                 DBInstanceType.READ_REPLI, deleted=False)
    # Replication delay is checked on every read-replica plus the standby.
    slave_ids = [rr.instance_id for rr in rr_items]
    slave_ids.append(self.dbslave_id)
    utils.check_mysql_adminuser(self.inst_id)
    utils.check_mysql_adminuser(self.dbslave_id)
    # BUG FIX: the original loop body used self.inst_id instead of the loop
    # variable, so mysqld was never actually checked on the standby.
    for _id in [self.inst_id, self.dbslave_id]:
        utils.check_mysql_is_running(_id)
    # The vip is checked against the master once (the original repeated the
    # identical call inside the loop).
    utils.check_vip(self.inst_id, vip_id=self.vip_id)
    self.backup_id = utils.check_backup(self.group_id)
    utils.check_backup_status(self.backup_id)
    utils.check_backup_path(self.backup_id)
    for slave_id in slave_ids:
        utils.check_rpl_delay(slave_id)
    master_inst = utils.get_builtin_instance(self.inst_id)
    slave_inst = utils.get_builtin_instance(self.dbslave_id)
    master_ip = utils.check_allocate_ip(master_inst.server)
    slave_ip = utils.check_allocate_ip(slave_inst.server)
    LOG.info("master_ip:%s slave_ip:%s" % (master_ip, slave_ip))
    utils.check_rpl_consist(self.inst_id, slave_ids, master_ip, [slave_ip])
    utils.check_rpl_topo_ha(self.group_id)
def _failover_test(self, group_id, trigger_inst_id, do_workload = False, do_prepare = False,
                   mysqld_killed = False, host_rebooted = False, remove_tmp_initsql = False,
                   mysql_data_lost = False, check_vip = False, check_rpl_consist = True,
                   check_binlog_range = False):
    """Driver for failover scenarios.

    Optionally injects a failure on ``trigger_inst_id`` (mysqld crash,
    host reboot, mysql data loss, removal of the temp init SQL file),
    triggers taskmanager "failover", then verifies vip/rip remapping,
    cluster membership and (optionally) that the restorable binlog window
    keeps advancing.

    :param group_id: id of the instance group under test.
    :param trigger_inst_id: instance on which the failure is injected.
    :param do_workload: run a workload first (only when trigger is MASTER).
    :param do_prepare: passed through to the workload runner.
    :param check_rpl_consist: verify group membership/status after failover.
    :param check_binlog_range: verify restorable time keeps moving forward.
    """
    LOG.info("Doing Failover Test, group_id:%s, instance_id:%s, do_workload:%s, do_prepare:%s."
             % (group_id, trigger_inst_id, do_workload, do_prepare))
    # Snapshot group membership and the trigger instance before failover.
    before_group_items = InstanceGroupItem.list_by_gid(test_utils.get_context(), group_id, deleted = False)
    before_items = set(map(lambda x: x.type + "_" + x.instance_id, before_group_items))
    before_instance = test_utils.get_builtin_instance( trigger_inst_id)
    before_rip = test_utils.check_allocate_ip(before_instance.server)
    before_origin_instid = before_instance.id
    rt_before = rt_after = None
    if check_binlog_range:
        rt_before = test_utils.get_restorable_time(trigger_inst_id)
    if do_workload and before_instance.type == DBInstanceType.MASTER:
        FAILOVERInstance.__run_workload(do_prepare = do_prepare)
    # --- failure injection ---
    if remove_tmp_initsql:
        FAILOVERInstance.__trigger_vm_remove_tmp_sql_file(trigger_inst_id)
    if mysqld_killed:
        FAILOVERInstance.__trigger_mysqld_crash(trigger_inst_id)
        # Wait until the crash is observed as SHUTDOWN before failing over.
        test_utils.check_server_status(trigger_inst_id, expected_task=tasks.InstanceTasks.NONE,
                                       type=before_instance.type,
                                       expected_svr_status=test_utils.ServiceStatuses.SHUTDOWN,
                                       deleted=False, timeout=120)
    if host_rebooted:
        FAILOVERInstance.__trigger_host_reboot(trigger_inst_id)
        # When the host machine is rebooted, no guestagent updates the
        # service's status, so the SHUTDOWN wait is skipped here.
        # test_utils.check_server_status(trigger_inst_id, expected_task=tasks.InstanceTasks.NONE,
        #                                type=before_instance.type, expected_svr_status=test_utils.ServiceStatuses.SHUTDOWN,
        #                                deleted=False, timeout=120)
    if mysql_data_lost:
        FAILOVERInstance.__trigger_mysql_data_lost(trigger_inst_id)
    rpc.call(test_utils.get_context(), "taskmanager",
             {"method": "failover", "args": {'instance_id':before_origin_instid}}, timeout = 3600)
    ## check vip <--> rip mapping.
    ## vip should be changed in 10 seconds.
    if before_instance.type == DBInstanceType.MASTER or before_instance.type == DBInstanceType.READ_REPLI:
        after_instance = test_utils.get_builtin_instance( trigger_inst_id)
        after_nova_inst = after_instance.server
        after_rip = test_utils.check_allocate_ip(after_nova_inst)
        # Same vip, different real ip => vip now points at the new VM.
        assert after_instance.vip == before_instance.vip and before_rip != after_rip
    if before_instance.type == DBInstanceType.MASTER:
        test_utils.check_server_status(before_instance.id, expected_task = tasks.InstanceTasks.NONE,
                                       type=DBInstanceType.MASTER,
                                       expected_svr_status = test_utils.ServiceStatuses.RUNNING,
                                       deleted=False, timeout=120)
    ## check replication topo
    after_group_items = InstanceGroupItem.list_by_gid(test_utils.get_context(), group_id, deleted = False)
    after_items = set(map(lambda x: x.type + "_" + x.instance_id, after_group_items))
    LOG.info("before " + str(before_items))
    LOG.info("after " + str(after_items))
    if check_rpl_consist:
        # NOTE(review): diff_items is computed but its assert is commented
        # out, so only the group *size* is enforced below.
        diff_items = (before_items - after_items)
        # assert len(diff_items) == 0
        assert len(before_group_items) == len(after_group_items), "size of mysql cluster should be the same."
        for group_item in after_group_items:
            if group_item.type == DBInstanceType.STANDBY and group_item.instance_id == before_instance.id:
                # The old trigger instance may have been demoted to STANDBY;
                # only require that its group item still exists.
                item = InstanceGroupItem.get_by_instance_id(test_utils.get_context(), group_item.instance_id, deleted = False)
                assert item != None
                continue
            test_utils.check_server_status(group_item.instance_id, expected_task = tasks.InstanceTasks.NONE,
                                           type = group_item.type,
                                           expected_svr_status = test_utils.ServiceStatuses.RUNNING,
                                           deleted = False, timeout = 120)
    if check_binlog_range:
        # The restorable window must advance past its pre-failover end, and
        # keep advancing after another minute.
        rt_after = test_utils.get_restorable_time(trigger_inst_id)
        assert rt_after.end > rt_before.end, (rt_after.end, rt_before.end)
        time.sleep(60)
        rt_after2 = test_utils.get_restorable_time(trigger_inst_id)
        assert rt_after2.end > rt_after.end, (rt_after2.end, rt_after.end)
def failover(id):
    """Ask the taskmanager service to fail over the given instance id."""
    payload = {"method": "failover", "args": {'instance_id': id}}
    rpc.call(utils.get_context(), "taskmanager", payload)
def failover(self, instance_id=None, _strategy=None):
    """HA failover test: stop mysqld on master or standby (per
    CONF.ha_failover_strategy), trigger taskmanager "failover", then verify
    the group recovers and generated data survived.

    :param instance_id: master instance id; looked up by tenant when None.
    :param _strategy: unused; strategy is taken from CONF.ha_failover_strategy.
    """
    self.inst_id = instance_id
    if not self.inst_id:
        self.inst_id = utils.get_instance_id_bytenant(DBInstanceType.MASTER)
    _inst = utils.get_builtin_instance(self.inst_id)
    self.group_id = _inst.group_id
    self.dbslave_id = utils.get_instance_id(self.group_id, DBInstanceType.STANDBY)
    # Both nodes must be healthy before injecting the failure.
    utils.check_server_status(self.inst_id, expected_task=utils.tasks.InstanceTasks.NONE,
                              type=DBInstanceType.MASTER,
                              expected_svr_status=utils.ServiceStatuses.RUNNING,
                              deleted=False, timeout=120)
    utils.check_server_status(self.dbslave_id, expected_task=utils.tasks.InstanceTasks.NONE,
                              type=DBInstanceType.STANDBY,
                              expected_svr_status=utils.ServiceStatuses.RUNNING,
                              deleted=False, timeout=120)
    self.vip_id = utils.get_vip_id(self.inst_id)
    strategy = CONF.ha_failover_strategy
    virtual_instance_id = None
    # Pick which node the failure is injected on.
    if strategy == 'master':
        _ret = utils.get_builtin_instance(self.inst_id)
        nova_instance = _ret.server
        instance_id = _ret.id
        type = DBInstanceType.MASTER
        virtual_instance_id = _ret.virtual_instance_id
    elif strategy == 'standby':
        _ret = utils.get_builtin_instance(self.dbslave_id)
        nova_instance = _ret.server
        instance_id = _ret.id
        type = DBInstanceType.STANDBY
    else:
        raise Exception("not found ha_failover_strategy %s" % strategy)
    # Seed a random number of databases to verify data survival later.
    rancount = random.randint(50, 100)
    utils.generate_databases(self.inst_id, count = rancount)
    ip = utils.check_allocate_ip(nova_instance)
    # Stop mysqld (and the guestagent) to simulate the node failure.
    utils.stop_mysqld(ip, stop_ga=True)
    utils.check_server_status(instance_id, expected_task=utils.tasks.InstanceTasks.NONE,
                              type=type,
                              expected_svr_status=utils.ServiceStatuses.SHUTDOWN,
                              deleted=False, timeout=120)
    rpc.call(utils.get_context(), "taskmanager",
             {"method": "failover", "args": {'instance_id':instance_id}})
    if strategy == 'master':
        # After master failover the virtual id maps to a new origin id.
        origin_inst_id = inst_utils.virtual_instid_2_origin_instid(virtual_instance_id)
        self.inst_id = origin_inst_id
        utils.check_server_status(origin_inst_id, expected_task=utils.tasks.InstanceTasks.NONE,
                                  type=DBInstanceType.MASTER,
                                  expected_svr_status=utils.ServiceStatuses.RUNNING,
                                  deleted=False, timeout=120)
    # In both strategies a (possibly new) standby must come up RUNNING.
    new_slave_id = utils.get_instance_id(self.group_id, DBInstanceType.STANDBY)
    utils.check_server_status(new_slave_id, expected_task=utils.tasks.InstanceTasks.NONE,
                              type=DBInstanceType.STANDBY,
                              expected_svr_status=utils.ServiceStatuses.RUNNING,
                              deleted=False, timeout=120)
    self.dbslave_id = new_slave_id
    utils.check_generated_databases(self.inst_id, count = rancount)
    utils.clear_generated_databases(self.dbslave_id, count = rancount)
    self.validate()
def migrate(self, instance_id=None, _strategy=None):
    """HA migrate test: migrate either the master or the standby (per
    CONF.migrate_strategy) via taskmanager "migrate", then verify the new
    server came up and generated data survived.

    :param instance_id: master instance id; looked up by tenant when None.
    :param _strategy: unused; strategy is taken from CONF.migrate_strategy.
    """
    self.inst_id = instance_id
    if not self.inst_id:
        self.inst_id = utils.get_instance_id_bytenant(DBInstanceType.MASTER)
    _master_inst = utils.get_instance(id = self.inst_id, deleted = 0)
    self.group_id = _master_inst.group_id
    self.dbslave_id = utils.get_instance_id(self.group_id, DBInstanceType.STANDBY)
    # Both nodes must be healthy before migrating.
    utils.check_server_status(self.inst_id, expected_task=utils.tasks.InstanceTasks.NONE,
                              type=DBInstanceType.MASTER,
                              expected_svr_status=utils.ServiceStatuses.RUNNING,
                              deleted=False, timeout=120)
    utils.check_server_status(self.dbslave_id, expected_task=utils.tasks.InstanceTasks.NONE,
                              type=DBInstanceType.STANDBY,
                              expected_svr_status=utils.ServiceStatuses.RUNNING,
                              deleted=False, timeout=120)
    self.vip_id = utils.get_vip_id(_master_inst.id)
    virtual_instance_id = None
    strategy = CONF.migrate_strategy
    # Pick which node is migrated.
    if strategy == 'master':
        _ret = utils.get_builtin_instance(_master_inst.id)
        nova_instance = _ret.server
        instance_id = _master_inst.id
        type = DBInstanceType.MASTER
        virtual_instance_id = _ret.virtual_instance_id
    elif strategy == 'standby':
        _ret = utils.get_builtin_instance(self.dbslave_id)
        nova_instance = _ret.server
        instance_id = self.dbslave_id
        type = DBInstanceType.STANDBY
    else:
        raise Exception("not found migrate_strategy ss%s" % strategy)
    # Seed a random number of databases to verify data survival later.
    ran_count = random.randint(50, 100)
    utils.generate_databases(self.inst_id, count = ran_count)
    _ret = rpc.call(utils.get_context(), "taskmanager",
                    {"method": "migrate", "args": {'migrate_id':instance_id}})
    if strategy == 'master':
        # After migration the virtual id maps to a new origin instance.
        raw_instance_id = inst_utils.virtual_instid_2_origin_instid(virtual_instance_id)
        # NOTE(review): server_id is read *before* waiting for RUNNING;
        # presumably the mapping is already final here -- confirm.
        new_server_id = utils.get_builtin_instance(raw_instance_id).server_id
        utils.check_server_status(raw_instance_id, expected_task=utils.tasks.InstanceTasks.NONE,
                                  type=DBInstanceType.MASTER,
                                  expected_svr_status=utils.ServiceStatuses.RUNNING,
                                  deleted=False, timeout=120)
        # The migrated master must be on a different nova server.
        assert new_server_id != nova_instance.id
        self.inst_id = raw_instance_id
    elif strategy == "standby":
        # NOTE(review): timeout=123 looks like a typo for 120 -- confirm.
        utils.check_server_status(_ret['id'], expected_task=utils.tasks.InstanceTasks.NONE,
                                  type = DBInstanceType.STANDBY,
                                  expected_svr_status=utils.ServiceStatuses.RUNNING,
                                  deleted=False, timeout = 123)
        self.dbslave_id = _ret['id']
    utils.check_generated_databases(self.inst_id, count = ran_count)
    utils.check_generated_databases(self.dbslave_id, count = ran_count)
    utils.clear_generated_databases(self.inst_id, count = ran_count)
    self.validate()