def eject_replica_source(self, context, instance_id):

    def _eject_replica_source(old_master, replica_models):
        master_candidate = self._most_current_replica(
            old_master, replica_models)
        master_ips = old_master.detach_public_ips()
        slave_ips = master_candidate.detach_public_ips()
        master_candidate.detach_replica(old_master, for_failover=True)
        master_candidate.enable_as_master()
        master_candidate.attach_public_ips(master_ips)
        master_candidate.make_read_only(False)
        old_master.attach_public_ips(slave_ips)

        exception_replicas = []
        for replica in replica_models:
            try:
                if replica.id != master_candidate.id:
                    replica.detach_replica(old_master, for_failover=True)
                    replica.attach_replica(master_candidate)
            except exception.TroveError:
                msg = _("eject-replica-source: Unable to migrate "
                        "replica %(slave)s from old replica source "
                        "%(old_master)s to new source %(new_master)s.")
                msg_values = {
                    "slave": replica.id,
                    "old_master": old_master.id,
                    "new_master": master_candidate.id
                }
                LOG.exception(msg % msg_values)
                # Collect the instance model itself (not just its id):
                # _set_task_status below operates on instance models.
                exception_replicas.append(replica)

        self._set_task_status([old_master] + replica_models,
                              InstanceTasks.NONE)
        if exception_replicas:
            self._set_task_status(exception_replicas,
                                  InstanceTasks.EJECTION_ERROR)
            msg = _("eject-replica-source %(id)s: The following "
                    "replicas may not have been switched: %(replicas)s")
            msg_values = {
                "id": master_candidate.id,
                "replicas": [repl.id for repl in exception_replicas]
            }
            raise ReplicationSlaveAttachError(msg % msg_values)

    with EndNotification(context):
        master = BuiltInstanceTasks.load(context, instance_id)
        replicas = [BuiltInstanceTasks.load(context, dbinfo.id)
                    for dbinfo in master.slaves]
        try:
            _eject_replica_source(master, replicas)
        except ReplicationSlaveAttachError:
            raise
        except Exception:
            self._set_task_status([master] + replicas,
                                  InstanceTasks.EJECTION_ERROR)
            raise

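# For context: a minimal sketch of the _set_task_status() helper used by
# the failover methods in this file. The body is an assumption inferred
# from how the helper is called (with instance-task models and an
# InstanceTasks value), not necessarily the exact implementation; it
# assumes each model exposes its database record as db_info with a
# persistable task_status attribute.
def _set_task_status(self, instances, status):
    for instance in instances:
        # Persist the new task status on each instance's DB record.
        setattr(instance.db_info, 'task_status', status)
        instance.db_info.save()
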
def eject_replica_source(self, context, instance_id):

    def _eject_replica_source(old_master, replica_models):
        master_candidate = self._most_current_replica(old_master,
                                                      replica_models)
        master_ips = old_master.detach_public_ips()
        slave_ips = master_candidate.detach_public_ips()
        master_candidate.detach_replica(old_master, for_failover=True)
        master_candidate.enable_as_master(for_failover=True)
        master_candidate.attach_public_ips(master_ips)
        master_candidate.make_read_only(False)
        old_master.attach_public_ips(slave_ips)

        exception_replicas = []
        for replica in replica_models:
            try:
                if replica.id != master_candidate.id:
                    replica.detach_replica(old_master, for_failover=True)
                    replica.attach_replica(master_candidate)
            except exception.TroveError:
                msg = _("eject-replica-source: Unable to migrate "
                        "replica %(slave)s from old replica source "
                        "%(old_master)s to new source %(new_master)s.")
                msg_values = {
                    "slave": replica.id,
                    "old_master": old_master.id,
                    "new_master": master_candidate.id
                }
                LOG.exception(msg % msg_values)
                # Collect the instance model itself (not just its id):
                # _set_task_status below operates on instance models.
                exception_replicas.append(replica)

        self._set_task_status([old_master] + replica_models,
                              InstanceTasks.NONE)
        if exception_replicas:
            self._set_task_status(exception_replicas,
                                  InstanceTasks.EJECTION_ERROR)
            msg = _("eject-replica-source %(id)s: The following "
                    "replicas may not have been switched: %(replicas)s")
            msg_values = {
                "id": master_candidate.id,
                "replicas": [repl.id for repl in exception_replicas]
            }
            raise ReplicationSlaveAttachError(msg % msg_values)

    with EndNotification(context):
        master = BuiltInstanceTasks.load(context, instance_id)
        replicas = [BuiltInstanceTasks.load(context, dbinfo.id)
                    for dbinfo in master.slaves]
        try:
            _eject_replica_source(master, replicas)
        except ReplicationSlaveAttachError:
            raise
        except Exception:
            self._set_task_status([master] + replicas,
                                  InstanceTasks.EJECTION_ERROR)
            raise

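# _most_current_replica() is called above but not defined in these
# excerpts. A minimal sketch, inferred from the inline heuristic in the
# older eject_replica_source variant further below (pick the replica
# reporting the greatest transaction count); the real implementation may
# differ, and get_txn_count() returning an integer is an assumption.
def _most_current_replica(self, old_master, replica_models):
    master_candidate = None
    max_txn_count = 0
    for replica in replica_models:
        txn_count = replica.get_txn_count()
        if txn_count > max_txn_count:
            master_candidate = replica
            max_txn_count = txn_count
    # Note: returns None when no replica reports a positive count.
    return master_candidate
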
def _create_replication_slave(self, context, instance_id, name, flavor,
                              image_id, databases, users,
                              datastore_manager, packages, volume_size,
                              availability_zone, root_password, nics,
                              overrides, slave_of_id, backup_id,
                              volume_type, modules):

    if isinstance(instance_id, list):
        ids = instance_id
        root_passwords = root_password
    else:
        ids = [instance_id]
        root_passwords = [root_password]
    replica_number = 0
    replica_backup_id = backup_id
    replica_backup_created = False
    replicas = []

    master_instance_tasks = BuiltInstanceTasks.load(context, slave_of_id)
    server_group = master_instance_tasks.server_group
    scheduler_hints = srv_grp.ServerGroup.convert_to_hint(server_group)
    LOG.debug("Using scheduler hints for locality: %s", scheduler_hints)

    try:
        for replica_index in range(0, len(ids)):
            try:
                replica_number += 1
                LOG.debug("Creating replica %(num)d of %(count)d.",
                          {'num': replica_number, 'count': len(ids)})
                instance_tasks = FreshInstanceTasks.load(
                    context, ids[replica_index])
                snapshot = instance_tasks.get_replication_master_snapshot(
                    context, slave_of_id, flavor, replica_backup_id,
                    replica_number=replica_number)
                replica_backup_id = snapshot['dataset']['snapshot_id']
                replica_backup_created = (replica_backup_id is not None)
                instance_tasks.create_instance(
                    flavor, image_id, databases, users, datastore_manager,
                    packages, volume_size, replica_backup_id,
                    availability_zone, root_passwords[replica_index],
                    nics, overrides, None, snapshot, volume_type,
                    modules, scheduler_hints)
                replicas.append(instance_tasks)
            except Exception:
                # if it's the first replica, then we shouldn't continue
                LOG.exception(
                    "Could not create replica %(num)d of %(count)d.",
                    {'num': replica_number, 'count': len(ids)})
                if replica_number == 1:
                    raise

        for replica in replicas:
            replica.wait_for_instance(CONF.restore_usage_timeout, flavor)
    finally:
        if replica_backup_created:
            Backup.delete(context, replica_backup_id)

def _create_replication_slave(self, context, instance_id, name, flavor,
                              image_id, databases, users,
                              datastore_manager, packages, volume_size,
                              availability_zone, root_password, nics,
                              overrides, slave_of_id, backup_id,
                              volume_type, modules):

    if isinstance(instance_id, list):
        ids = instance_id
        root_passwords = root_password
    else:
        ids = [instance_id]
        root_passwords = [root_password]
    replica_number = 0
    replica_backup_id = backup_id
    replica_backup_created = False
    replicas = []

    master_instance_tasks = BuiltInstanceTasks.load(context, slave_of_id)
    server_group = master_instance_tasks.server_group
    scheduler_hints = srv_grp.ServerGroup.convert_to_hint(server_group)
    LOG.debug("Using scheduler hints for locality: %s" % scheduler_hints)

    try:
        for replica_index in range(0, len(ids)):
            try:
                replica_number += 1
                LOG.debug("Creating replica %d of %d."
                          % (replica_number, len(ids)))
                instance_tasks = FreshInstanceTasks.load(
                    context, ids[replica_index])
                snapshot = instance_tasks.get_replication_master_snapshot(
                    context, slave_of_id, flavor, replica_backup_id,
                    replica_number=replica_number)
                replica_backup_id = snapshot['dataset']['snapshot_id']
                replica_backup_created = (replica_backup_id is not None)
                instance_tasks.create_instance(
                    flavor, image_id, databases, users, datastore_manager,
                    packages, volume_size, replica_backup_id,
                    availability_zone, root_passwords[replica_index],
                    nics, overrides, None, snapshot, volume_type,
                    modules, scheduler_hints)
                replicas.append(instance_tasks)
            except Exception:
                # if it's the first replica, then we shouldn't continue
                LOG.exception(_(
                    "Could not create replica %(num)d of %(count)d.")
                    % {'num': replica_number, 'count': len(ids)})
                if replica_number == 1:
                    raise

        for replica in replicas:
            replica.wait_for_instance(CONF.restore_usage_timeout, flavor)
    finally:
        if replica_backup_created:
            Backup.delete(context, replica_backup_id)

def detach_replica(self, context, instance_id):
    with EndNotification(context):
        slave = models.BuiltInstanceTasks.load(context, instance_id)
        master_id = slave.slave_of_id
        master = models.BuiltInstanceTasks.load(context, master_id)
        slave.detach_replica(master)
        if master.post_processing_required_for_replication():
            slave_instances = [
                BuiltInstanceTasks.load(context, slave_model.id)
                for slave_model in master.slaves]
            slave_detail = [slave_instance.get_replication_detail()
                            for slave_instance in slave_instances]
            master.complete_master_setup(slave_detail)

def _get_instance_task(self):
    dbinst = DBInstance(InstanceTasks.NONE, name='name',
                        created='created',
                        compute_instance_id='compute_instance_id',
                        task_id='task_id',
                        task_description='task_description',
                        task_start_time='task_start_time',
                        volume_id='volume_id', deleted='deleted',
                        tenant_id='tenant_id',
                        service_type='service_type')
    server = fake()
    service_status = fake()
    service_status.status = ServiceStatuses.RUNNING
    # inst = BaseInstance(self.context, dbinst, server, service_status)
    inst = BuiltInstanceTasks(self.context, dbinst, server,
                              service_status)
    when(inst_models).load_instance(
        any(), any(), any(), needs_server=any()).thenReturn(inst)
    instance_tasks = BuiltInstanceTasks.load(self.context, "instance_id")
    return instance_tasks

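# A hypothetical usage sketch for the helper above (the test name and
# assertion are illustrative, not taken from the real test suite): with
# load_instance stubbed out, the loaded object is the mocked
# BuiltInstanceTasks instance.
def test_get_instance_task(self):
    instance_tasks = self._get_instance_task()
    self.assertIsInstance(instance_tasks, BuiltInstanceTasks)
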
def promote_to_replica_source(self, context, instance_id):
    # TODO(atomic77) Promote and eject need to be able to handle the case
    # where a datastore like Postgresql needs to treat the slave to be
    # promoted differently from the old master and the slaves which will
    # be simply reassigned to a new master. See:
    # https://bugs.launchpad.net/trove/+bug/1553339

    def _promote_to_replica_source(old_master, master_candidate,
                                   replica_models):
        # First, we transition from the old master to new as quickly as
        # possible to minimize the scope of unrecoverable error
        old_master.make_read_only(True)
        master_ips = old_master.detach_public_ips()
        slave_ips = master_candidate.detach_public_ips()
        latest_txn_id = old_master.get_latest_txn_id()
        master_candidate.wait_for_txn(latest_txn_id)
        master_candidate.detach_replica(old_master, for_failover=True)
        master_candidate.enable_as_master()
        old_master.attach_replica(master_candidate)
        master_candidate.attach_public_ips(master_ips)
        master_candidate.make_read_only(False)
        old_master.attach_public_ips(slave_ips)

        # At this point, should something go wrong, there
        # should be a working master with some number of working slaves,
        # and possibly some number of "orphaned" slaves
        exception_replicas = []
        error_messages = ""
        for replica in replica_models:
            try:
                if replica.id != master_candidate.id:
                    replica.detach_replica(old_master, for_failover=True)
                    replica.attach_replica(master_candidate)
            except exception.TroveError as ex:
                msg = (_("Unable to migrate replica %(slave)s from "
                         "old replica source %(old_master)s to "
                         "new source %(new_master)s on promote.") %
                       {"slave": replica.id,
                        "old_master": old_master.id,
                        "new_master": master_candidate.id})
                LOG.exception(msg)
                exception_replicas.append(replica)
                error_messages += "%s (%s)\n" % (msg, ex)

        try:
            old_master.demote_replication_master()
        except Exception as ex:
            msg = (_("Exception demoting old replica source %s.")
                   % old_master.id)
            LOG.exception(msg)
            exception_replicas.append(old_master)
            error_messages += "%s (%s)\n" % (msg, ex)

        self._set_task_status([old_master] + replica_models,
                              InstanceTasks.NONE)
        if exception_replicas:
            self._set_task_status(exception_replicas,
                                  InstanceTasks.PROMOTION_ERROR)
            msg = (_("promote-to-replica-source %(id)s: The following "
                     "replicas may not have been switched: %(replicas)s:"
                     "\n%(err)s") %
                   {"id": master_candidate.id,
                    "replicas": [repl.id for repl in exception_replicas],
                    "err": error_messages})
            raise ReplicationSlaveAttachError(msg)

    with EndNotification(context):
        master_candidate = BuiltInstanceTasks.load(context, instance_id)
        old_master = BuiltInstanceTasks.load(context,
                                             master_candidate.slave_of_id)
        replicas = []
        for replica_dbinfo in old_master.slaves:
            if replica_dbinfo.id == instance_id:
                replica = master_candidate
            else:
                replica = BuiltInstanceTasks.load(context,
                                                  replica_dbinfo.id)
            replicas.append(replica)

        try:
            _promote_to_replica_source(old_master, master_candidate,
                                       replicas)
        except ReplicationSlaveAttachError:
            raise
        except Exception:
            self._set_task_status([old_master] + replicas,
                                  InstanceTasks.PROMOTION_ERROR)
            raise

def promote_to_replica_source(self, context, instance_id):

    def _promote_to_replica_source(old_master, master_candidate,
                                   replica_models):
        # First, we transition from the old master to new as quickly as
        # possible to minimize the scope of unrecoverable error
        old_master.make_read_only(True)
        master_ips = old_master.detach_public_ips()
        slave_ips = master_candidate.detach_public_ips()
        latest_txn_id = old_master.get_latest_txn_id()
        master_candidate.wait_for_txn(latest_txn_id)
        master_candidate.detach_replica(old_master, for_failover=True)
        master_candidate.enable_as_master()
        old_master.attach_replica(master_candidate)
        master_candidate.attach_public_ips(master_ips)
        master_candidate.make_read_only(False)
        old_master.attach_public_ips(slave_ips)

        # At this point, should something go wrong, there
        # should be a working master with some number of working slaves,
        # and possibly some number of "orphaned" slaves
        exception_replicas = []
        for replica in replica_models:
            try:
                if replica.id != master_candidate.id:
                    replica.detach_replica(old_master, for_failover=True)
                    replica.attach_replica(master_candidate)
            except exception.TroveError:
                msg = _("promote-to-replica-source: Unable to migrate "
                        "replica %(slave)s from old replica source "
                        "%(old_master)s to new source %(new_master)s.")
                msg_values = {
                    "slave": replica.id,
                    "old_master": old_master.id,
                    "new_master": master_candidate.id
                }
                LOG.exception(msg % msg_values)
                exception_replicas.append(replica)

        try:
            old_master.demote_replication_master()
        except Exception:
            LOG.exception(_("Exception demoting old replica source"))
            exception_replicas.append(old_master)

        self._set_task_status([old_master] + replica_models,
                              InstanceTasks.NONE)
        if exception_replicas:
            self._set_task_status(exception_replicas,
                                  InstanceTasks.PROMOTION_ERROR)
            msg = _("promote-to-replica-source %(id)s: The following "
                    "replicas may not have been switched: %(replicas)s")
            msg_values = {
                "id": master_candidate.id,
                # Format ids rather than raw model objects so the error
                # message stays readable.
                "replicas": [repl.id for repl in exception_replicas]
            }
            raise ReplicationSlaveAttachError(msg % msg_values)

    with EndNotification(context):
        master_candidate = BuiltInstanceTasks.load(context, instance_id)
        old_master = BuiltInstanceTasks.load(context,
                                             master_candidate.slave_of_id)
        replicas = []
        for replica_dbinfo in old_master.slaves:
            if replica_dbinfo.id == instance_id:
                replica = master_candidate
            else:
                replica = BuiltInstanceTasks.load(context,
                                                  replica_dbinfo.id)
            replicas.append(replica)

        try:
            _promote_to_replica_source(old_master, master_candidate,
                                       replicas)
        except ReplicationSlaveAttachError:
            raise
        except Exception:
            self._set_task_status([old_master] + replicas,
                                  InstanceTasks.PROMOTION_ERROR)
            raise

def eject_replica_source(self, context, instance_id):

    def _eject_replica_source(old_master, replica_models):
        master_candidate = self._most_current_replica(old_master,
                                                      replica_models)
        master_ips = old_master.detach_public_ips()
        slave_ips = master_candidate.detach_public_ips()
        master_candidate.detach_replica(old_master, for_failover=True,
                                        for_promote=True)
        master_candidate.enable_as_master()
        master_candidate.attach_public_ips(master_ips)
        master_candidate.make_read_only(False)
        old_master.attach_public_ips(slave_ips)

        exception_replicas = []
        error_messages = ""
        for replica in replica_models:
            try:
                if replica.id != master_candidate.id:
                    replica.detach_replica(old_master, for_failover=True)
                    replica.attach_replica(master_candidate)
            except exception.TroveError as ex:
                msg = (_("Unable to migrate replica %(slave)s from "
                         "old replica source %(old_master)s to "
                         "new source %(new_master)s on eject.") %
                       {"slave": replica.id,
                        "old_master": old_master.id,
                        "new_master": master_candidate.id})
                LOG.exception(msg)
                exception_replicas.append(replica)
                error_messages += "%s (%s)\n" % (msg, ex)

        if master_candidate.post_processing_required_for_replication():
            new_slaves = list(replica_models)
            new_slaves.remove(master_candidate)
            new_slaves_detail = [slave.get_replication_detail()
                                 for slave in new_slaves]
            master_candidate.complete_master_setup(new_slaves_detail)

        self._set_task_status([old_master] + replica_models,
                              InstanceTasks.NONE)
        if exception_replicas:
            self._set_task_status(exception_replicas,
                                  InstanceTasks.EJECTION_ERROR)
            msg = (_("eject-replica-source %(id)s: The following "
                     "replicas may not have been switched: %(replicas)s")
                   % {"id": master_candidate.id,
                      "replicas": [repl.id for repl in
                                   exception_replicas]})
            raise ReplicationSlaveAttachError("%s:\n%s" %
                                              (msg, error_messages))

    with EndNotification(context):
        master = BuiltInstanceTasks.load(context, instance_id)
        replicas = [BuiltInstanceTasks.load(context, dbinfo.id)
                    for dbinfo in master.slaves]
        try:
            _eject_replica_source(master, replicas)
        except ReplicationSlaveAttachError:
            raise
        except Exception:
            self._set_task_status([master] + replicas,
                                  InstanceTasks.EJECTION_ERROR)
            raise

def promote_to_replica_source(self, context, instance_id):
    # TODO(atomic77) Promote and eject need to be able to handle the case
    # where a datastore like Postgresql needs to treat the slave to be
    # promoted differently from the old master and the slaves which will
    # be simply reassigned to a new master. See:
    # https://bugs.launchpad.net/trove/+bug/1553339

    def _promote_to_replica_source(old_master, master_candidate,
                                   replica_models):
        # First, we transition from the old master to new as quickly as
        # possible to minimize the scope of unrecoverable error.
        # NOTE(zhaochao): we cannot reattach the old master to the new
        # one immediately after the new master is up, because for MariaDB
        # the other replicas are still connecting to the old master, and
        # during reattaching the old master as a slave, new GTID may be
        # created and synced to the replicas. After that, when attaching
        # the replicas to the new master, 'START SLAVE' will fail by
        # 'fatal error 1236' if the binlog of the replica diverged from
        # the new master. So the proper order should be:
        # 1. make the old master read only (and detach floating ips)
        # 2. make sure the new master is up-to-date
        # 3. detach the new master from the old one
        # 4. enable the new master (and attach floating ips)
        # 5. attach the other replicas to the new master
        # 6. attach the old master to the new one
        #    (and attach floating ips)
        # 7. demote the old master
        # What we changed here is the order of the 6th step, previously
        # this step took place right after step 4, which causes failures
        # with MariaDB replications.
        old_master.make_read_only(True)
        master_ips = old_master.detach_public_ips()
        slave_ips = master_candidate.detach_public_ips()
        latest_txn_id = old_master.get_latest_txn_id()
        master_candidate.wait_for_txn(latest_txn_id)
        master_candidate.detach_replica(old_master, for_failover=True)
        master_candidate.enable_as_master()
        master_candidate.attach_public_ips(master_ips)
        master_candidate.make_read_only(False)

        # At this point, should something go wrong, there
        # should be a working master with some number of working slaves,
        # and possibly some number of "orphaned" slaves
        exception_replicas = []
        error_messages = ""
        for replica in replica_models:
            try:
                if replica.id != master_candidate.id:
                    replica.detach_replica(old_master, for_failover=True)
                    replica.attach_replica(master_candidate)
            except exception.TroveError as ex:
                log_fmt = ("Unable to migrate replica %(slave)s from "
                           "old replica source %(old_master)s to "
                           "new source %(new_master)s on promote.")
                exc_fmt = _("Unable to migrate replica %(slave)s from "
                            "old replica source %(old_master)s to "
                            "new source %(new_master)s on promote.")
                msg_content = {
                    "slave": replica.id,
                    "old_master": old_master.id,
                    "new_master": master_candidate.id}
                LOG.exception(log_fmt, msg_content)
                exception_replicas.append(replica)
                error_messages += "%s (%s)\n" % (
                    exc_fmt % msg_content, ex)

        # dealing with the old master after all the other replicas
        # have been migrated.
        old_master.attach_replica(master_candidate)
        old_master.attach_public_ips(slave_ips)

        try:
            old_master.demote_replication_master()
        except Exception as ex:
            log_fmt = "Exception demoting old replica source %s."
            exc_fmt = _("Exception demoting old replica source %s.")
            LOG.exception(log_fmt, old_master.id)
            exception_replicas.append(old_master)
            error_messages += "%s (%s)\n" % (
                exc_fmt % old_master.id, ex)

        self._set_task_status([old_master] + replica_models,
                              InstanceTasks.NONE)
        if exception_replicas:
            self._set_task_status(exception_replicas,
                                  InstanceTasks.PROMOTION_ERROR)
            msg = (_("promote-to-replica-source %(id)s: The following "
                     "replicas may not have been switched: %(replicas)s:"
                     "\n%(err)s") %
                   {"id": master_candidate.id,
                    "replicas": [repl.id for repl in exception_replicas],
                    "err": error_messages})
            raise ReplicationSlaveAttachError(msg)

    with EndNotification(context):
        master_candidate = BuiltInstanceTasks.load(context, instance_id)
        old_master = BuiltInstanceTasks.load(context,
                                             master_candidate.slave_of_id)
        replicas = []
        for replica_dbinfo in old_master.slaves:
            if replica_dbinfo.id == instance_id:
                replica = master_candidate
            else:
                replica = BuiltInstanceTasks.load(context,
                                                  replica_dbinfo.id)
            replicas.append(replica)

        try:
            _promote_to_replica_source(old_master, master_candidate,
                                       replicas)
        except ReplicationSlaveAttachError:
            raise
        except Exception:
            self._set_task_status([old_master] + replicas,
                                  InstanceTasks.PROMOTION_ERROR)
            raise

def promote_to_replica_source(self, context, instance_id):
    # TODO(atomic77) Promote and eject need to be able to handle the case
    # where a datastore like Postgresql needs to treat the slave to be
    # promoted differently from the old master and the slaves which will
    # be simply reassigned to a new master. See:
    # https://bugs.launchpad.net/trove/+bug/1553339

    def _promote_to_replica_source(old_master, master_candidate,
                                   replica_models):
        # First, we transition from the old master to new as quickly as
        # possible to minimize the scope of unrecoverable error
        old_master.make_read_only(True)
        master_ips = old_master.detach_public_ips()
        slave_ips = master_candidate.detach_public_ips()
        latest_txn_id = old_master.get_latest_txn_id()
        master_candidate.wait_for_txn(latest_txn_id)
        old_master.pre_replication_demote()
        master_candidate.detach_replica(old_master, for_failover=True,
                                        for_promote=True)
        master_candidate.enable_as_master()
        old_master.attach_replica(master_candidate)
        master_candidate.attach_public_ips(master_ips)
        master_candidate.make_read_only(False)
        old_master.attach_public_ips(slave_ips)

        # At this point, should something go wrong, there
        # should be a working master with some number of working slaves,
        # and possibly some number of "orphaned" slaves
        exception_replicas = []
        error_messages = ""
        for replica in replica_models:
            try:
                if replica.id != master_candidate.id:
                    replica.detach_replica(old_master, for_failover=True)
                    replica.attach_replica(master_candidate)
            except exception.TroveError as ex:
                msg = (_("Unable to migrate replica %(slave)s from "
                         "old replica source %(old_master)s to "
                         "new source %(new_master)s on promote.") %
                       {"slave": replica.id,
                        "old_master": old_master.id,
                        "new_master": master_candidate.id})
                LOG.exception(msg)
                exception_replicas.append(replica)
                error_messages += "%s (%s)\n" % (msg, ex)

        try:
            old_master.demote_replication_master()
        except Exception as ex:
            msg = (_("Exception demoting old replica source %s.")
                   % old_master.id)
            LOG.exception(msg)
            exception_replicas.append(old_master)
            error_messages += "%s (%s)\n" % (msg, ex)

        if master_candidate.post_processing_required_for_replication():
            new_slaves = list(replica_models)
            new_slaves.remove(master_candidate)
            new_slaves.append(old_master)
            new_slaves_detail = [slave.get_replication_detail()
                                 for slave in new_slaves]
            master_candidate.complete_master_setup(new_slaves_detail)

        self._set_task_status([old_master] + replica_models,
                              InstanceTasks.NONE)
        if exception_replicas:
            self._set_task_status(exception_replicas,
                                  InstanceTasks.PROMOTION_ERROR)
            msg = (_("promote-to-replica-source %(id)s: The following "
                     "replicas may not have been switched: %(replicas)s")
                   % {"id": master_candidate.id,
                      "replicas": [repl.id for repl in
                                   exception_replicas]})
            raise ReplicationSlaveAttachError("%s:\n%s" %
                                              (msg, error_messages))

    with EndNotification(context):
        master_candidate = BuiltInstanceTasks.load(context, instance_id)
        old_master = BuiltInstanceTasks.load(context,
                                             master_candidate.slave_of_id)
        replicas = []
        for replica_dbinfo in old_master.slaves:
            if replica_dbinfo.id == instance_id:
                replica = master_candidate
            else:
                replica = BuiltInstanceTasks.load(context,
                                                  replica_dbinfo.id)
            replicas.append(replica)

        try:
            _promote_to_replica_source(old_master, master_candidate,
                                       replicas)
        except ReplicationSlaveAttachError:
            raise
        except Exception:
            self._set_task_status([old_master] + replicas,
                                  InstanceTasks.PROMOTION_ERROR)
            raise

def eject_replica_source(self, context, instance_id):

    def _eject_replica_source(old_master, replica_models):
        master_candidate = self._most_current_replica(old_master,
                                                      replica_models)
        master_ips = old_master.detach_public_ips()
        slave_ips = master_candidate.detach_public_ips()
        master_candidate.detach_replica(old_master, for_failover=True)
        master_candidate.enable_as_master()
        master_candidate.attach_public_ips(master_ips)
        master_candidate.make_read_only(False)
        old_master.attach_public_ips(slave_ips)

        exception_replicas = []
        error_messages = ""
        for replica in replica_models:
            try:
                if replica.id != master_candidate.id:
                    replica.detach_replica(old_master, for_failover=True)
                    replica.attach_replica(master_candidate)
            except exception.TroveError as ex:
                log_fmt = ("Unable to migrate replica %(slave)s from "
                           "old replica source %(old_master)s to "
                           "new source %(new_master)s on eject.")
                exc_fmt = _("Unable to migrate replica %(slave)s from "
                            "old replica source %(old_master)s to "
                            "new source %(new_master)s on eject.")
                msg_content = {
                    "slave": replica.id,
                    "old_master": old_master.id,
                    "new_master": master_candidate.id}
                LOG.exception(log_fmt, msg_content)
                exception_replicas.append(replica)
                error_messages += "%s (%s)\n" % (
                    exc_fmt % msg_content, ex)

        self._set_task_status([old_master] + replica_models,
                              InstanceTasks.NONE)
        if exception_replicas:
            self._set_task_status(exception_replicas,
                                  InstanceTasks.EJECTION_ERROR)
            msg = (_("eject-replica-source %(id)s: The following "
                     "replicas may not have been switched: %(replicas)s:"
                     "\n%(err)s") %
                   {"id": master_candidate.id,
                    "replicas": [repl.id for repl in exception_replicas],
                    "err": error_messages})
            raise ReplicationSlaveAttachError(msg)

    with EndNotification(context):
        master = BuiltInstanceTasks.load(context, instance_id)
        replicas = [BuiltInstanceTasks.load(context, dbinfo.id)
                    for dbinfo in master.slaves]
        try:
            _eject_replica_source(master, replicas)
        except ReplicationSlaveAttachError:
            raise
        except Exception:
            self._set_task_status([master] + replicas,
                                  InstanceTasks.EJECTION_ERROR)
            raise

def promote_to_replica_source(self, context, instance_id):

    def _promote_to_replica_source(old_master, master_candidate,
                                   replica_models):
        # First, we transition from the old master to new as quickly as
        # possible to minimize the scope of unrecoverable error.
        old_master.make_read_only(True)
        master_ips = old_master.detach_public_ips()
        slave_ips = master_candidate.detach_public_ips()
        latest_txn_id = old_master.get_latest_txn_id()
        master_candidate.wait_for_txn(latest_txn_id)
        master_candidate.detach_replica(old_master, for_failover=True)
        master_candidate.enable_as_master()
        old_master.attach_replica(master_candidate)
        master_candidate.attach_public_ips(master_ips)
        master_candidate.make_read_only(False)
        old_master.attach_public_ips(slave_ips)

        # At this point, should something go wrong, there
        # should be a working master with some number of working slaves,
        # and possibly some number of "orphaned" slaves
        exception_replicas = []
        for replica in replica_models:
            try:
                replica.wait_for_txn(latest_txn_id)
                if replica.id != master_candidate.id:
                    replica.detach_replica(old_master, for_failover=True)
                    replica.attach_replica(master_candidate)
            except exception.TroveError:
                msg = _("promote-to-replica-source: Unable to migrate "
                        "replica %(slave)s from old replica source "
                        "%(old_master)s to new source %(new_master)s.")
                msg_values = {
                    "slave": replica.id,
                    "old_master": old_master.id,
                    "new_master": master_candidate.id
                }
                LOG.exception(msg % msg_values)
                # Collect the instance model itself (not just its id) so
                # the list can be passed to _set_task_status below, which
                # expects instance models.
                exception_replicas.append(replica)

        try:
            old_master.demote_replication_master()
        except Exception:
            LOG.exception(_("Exception demoting old replica source"))
            exception_replicas.append(old_master)

        self._set_task_status([old_master] + replica_models,
                              InstanceTasks.NONE)
        if exception_replicas:
            self._set_task_status(exception_replicas,
                                  InstanceTasks.PROMOTION_ERROR)
            msg = _("promote-to-replica-source %(id)s: The following "
                    "replicas may not have been switched: %(replicas)s")
            msg_values = {
                "id": master_candidate.id,
                "replicas": [repl.id for repl in exception_replicas]
            }
            raise ReplicationSlaveAttachError(msg % msg_values)

    master_candidate = BuiltInstanceTasks.load(context, instance_id)
    old_master = BuiltInstanceTasks.load(context,
                                         master_candidate.slave_of_id)
    replicas = []
    for replica_dbinfo in old_master.slaves:
        if replica_dbinfo.id == instance_id:
            replica = master_candidate
        else:
            replica = BuiltInstanceTasks.load(context, replica_dbinfo.id)
        replicas.append(replica)

    try:
        _promote_to_replica_source(old_master, master_candidate,
                                   replicas)
    except ReplicationSlaveAttachError:
        raise
    except Exception:
        self._set_task_status([old_master] + replicas,
                              InstanceTasks.PROMOTION_ERROR)
        raise

def eject_replica_source(self, context, instance_id):

    def _eject_replica_source(old_master, replica_models):
        # Select the slave with the greatest number of transactions to
        # be the new master.
        # TODO(mwj): Replace this heuristic with code to store the
        # site id of the master then use it to determine which slave
        # has the most recent txn from that master.
        master_candidate = None
        max_txn_count = 0
        for replica in replica_models:
            txn_count = replica.get_txn_count()
            if txn_count > max_txn_count:
                master_candidate = replica
                max_txn_count = txn_count

        master_ips = old_master.detach_public_ips()
        slave_ips = master_candidate.detach_public_ips()
        master_candidate.detach_replica(old_master, for_failover=True)
        master_candidate.enable_as_master()
        master_candidate.attach_public_ips(master_ips)
        master_candidate.make_read_only(False)
        old_master.attach_public_ips(slave_ips)

        exception_replicas = []
        for replica in replica_models:
            try:
                if replica.id != master_candidate.id:
                    replica.detach_replica(old_master, for_failover=True)
                    replica.attach_replica(master_candidate)
            except exception.TroveError:
                msg = _("eject-replica-source: Unable to migrate "
                        "replica %(slave)s from old replica source "
                        "%(old_master)s to new source %(new_master)s.")
                msg_values = {
                    "slave": replica.id,
                    "old_master": old_master.id,
                    "new_master": master_candidate.id
                }
                LOG.exception(msg % msg_values)
                # Collect the instance model itself (not just its id) so
                # the list can be passed to _set_task_status below.
                exception_replicas.append(replica)

        self._set_task_status([old_master] + replica_models,
                              InstanceTasks.NONE)
        if exception_replicas:
            self._set_task_status(exception_replicas,
                                  InstanceTasks.EJECTION_ERROR)
            msg = _("eject-replica-source %(id)s: The following "
                    "replicas may not have been switched: %(replicas)s")
            msg_values = {
                "id": master_candidate.id,
                "replicas": [repl.id for repl in exception_replicas]
            }
            raise ReplicationSlaveAttachError(msg % msg_values)

    master = BuiltInstanceTasks.load(context, instance_id)
    replicas = [BuiltInstanceTasks.load(context, dbinfo.id)
                for dbinfo in master.slaves]
    try:
        _eject_replica_source(master, replicas)
    except ReplicationSlaveAttachError:
        raise
    except Exception:
        self._set_task_status([master] + replicas,
                              InstanceTasks.EJECTION_ERROR)
        raise

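# The candidate-selection loop at the top of _eject_replica_source above
# can be written more compactly with max(). A sketch, with one behavioral
# caveat: max() always returns a replica (and raises ValueError on an
# empty sequence), whereas the loop above leaves master_candidate as None
# when every reported transaction count is zero.
master_candidate = max(replica_models,
                       key=lambda replica: replica.get_txn_count())
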
def _create_replication_slave(self, context, instance_id, name, flavor,
                              image_id, databases, users,
                              datastore_manager, packages, volume_size,
                              availability_zone, root_password, nics,
                              overrides, slave_of_id, backup_id,
                              volume_type, modules):

    if isinstance(instance_id, list):
        ids = instance_id
        root_passwords = root_password
    else:
        ids = [instance_id]
        root_passwords = [root_password]
    replica_number = 0
    replica_backup_id = backup_id
    replica_backup_created = False
    replicas = []

    master_instance_tasks = BuiltInstanceTasks.load(context, slave_of_id)
    server_group = master_instance_tasks.server_group
    scheduler_hints = srv_grp.ServerGroup.convert_to_hint(server_group)
    LOG.debug("Using scheduler hints for locality: %s" % scheduler_hints)

    try:
        for replica_index in range(0, len(ids)):
            try:
                replica_number += 1
                LOG.debug("Creating replica %d of %d."
                          % (replica_number, len(ids)))
                instance_tasks = FreshInstanceTasks.load(
                    context, ids[replica_index])
                snapshot = instance_tasks.get_replication_master_snapshot(
                    context, slave_of_id, flavor, replica_backup_id,
                    replica_number=replica_number)
                replica_backup_id = snapshot['dataset']['snapshot_id']
                replica_backup_created = (replica_backup_id is not None)
                instance_tasks.create_instance(
                    flavor, image_id, databases, users, datastore_manager,
                    packages, volume_size, replica_backup_id,
                    availability_zone, root_passwords[replica_index],
                    nics, overrides, None, snapshot, volume_type,
                    modules, scheduler_hints)
                replicas.append(instance_tasks)
            except Exception:
                # if it's the first replica, then we shouldn't continue
                LOG.exception(_(
                    "Could not create replica %(num)d of %(count)d.")
                    % {'num': replica_number, 'count': len(ids)})
                if replica_number == 1:
                    raise

        for replica in replicas:
            replica.wait_for_instance(CONF.restore_usage_timeout, flavor)

        # Some datastores require completing the configuration of the
        # replication nodes with information that is only available
        # after all the instances have been started.
        if (master_instance_tasks
                .post_processing_required_for_replication()):
            slave_instances = [BuiltInstanceTasks.load(context, slave.id)
                               for slave in master_instance_tasks.slaves]

            # Collect info from each slave post instance launch
            slave_detail = [slave_instance.get_replication_detail()
                            for slave_instance in slave_instances]

            # Pass info of all replication nodes to the master for
            # replication setup completion
            master_detail = master_instance_tasks.get_replication_detail()
            master_instance_tasks.complete_master_setup(slave_detail)

            # Pass info of all replication nodes to each slave for
            # replication setup completion
            for slave_instance in slave_instances:
                slave_instance.complete_slave_setup(master_detail,
                                                    slave_detail)

            # Push pending data/transactions from master to slaves
            master_instance_tasks.sync_data_to_slaves()

            # Set the status of all slave nodes to ACTIVE
            for slave_instance in slave_instances:
                slave_guest = remote.create_guest_client(
                    slave_instance.context, slave_instance.db_info.id,
                    slave_instance.datastore_version.manager)
                slave_guest.cluster_complete()
    finally:
        if replica_backup_created:
            Backup.delete(context, replica_backup_id)

def _create_replication_slave(self, context, instance_id, name, flavor,
                              image_id, databases, users,
                              datastore_manager, packages, volume_size,
                              availability_zone, root_password, nics,
                              overrides, slave_of_id, backup_id,
                              volume_type, modules, access=None,
                              ds_version=None):

    if isinstance(instance_id, list):
        ids = instance_id
        root_passwords = root_password
    else:
        ids = [instance_id]
        root_passwords = [root_password]
    replica_number = 0
    replica_backup_id = backup_id
    replicas = []

    master_instance_tasks = BuiltInstanceTasks.load(context, slave_of_id)
    server_group = master_instance_tasks.server_group
    scheduler_hints = srv_grp.ServerGroup.convert_to_hint(server_group)
    LOG.debug("Using scheduler hints %s for creating instance %s",
              scheduler_hints, instance_id)

    # Create backup for master
    snapshot = None
    try:
        instance_tasks = FreshInstanceTasks.load(context, ids[0])
        snapshot = instance_tasks.get_replication_master_snapshot(
            context, slave_of_id, flavor,
            parent_backup_id=replica_backup_id)
        LOG.info('Snapshot info for creating replica of %s: %s',
                 slave_of_id, snapshot)
    except Exception as err:
        LOG.error('Failed to get master snapshot info for creating '
                  'replica, error: %s', str(err))
        if snapshot and snapshot.get('dataset', {}).get('snapshot_id'):
            backup_id = snapshot['dataset']['snapshot_id']
            Backup.delete(context, backup_id)
        raise

    # Create replicas using the master backup
    replica_backup_id = snapshot['dataset']['snapshot_id']
    try:
        for replica_index in range(0, len(ids)):
            replica_number += 1
            LOG.info(f"Creating replica {replica_number} "
                     f"({ids[replica_index]}) of {len(ids)}.")
            instance_tasks = FreshInstanceTasks.load(
                context, ids[replica_index])
            instance_tasks.create_instance(
                flavor, image_id, databases, users, datastore_manager,
                packages, volume_size, replica_backup_id,
                availability_zone, root_passwords[replica_index],
                nics, overrides, None, snapshot, volume_type,
                modules, scheduler_hints, access=access,
                ds_version=ds_version)
            replicas.append(instance_tasks)

        for replica in replicas:
            replica.wait_for_instance(CONF.restore_usage_timeout, flavor)
            LOG.info('Replica %s created successfully', replica.id)
    except Exception as err:
        LOG.error('Failed to create replica from %s, error: %s',
                  slave_of_id, str(err))
        raise
    finally:
        Backup.delete(context, replica_backup_id)
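
# For reference, the only part of the master-snapshot payload these
# methods rely on is the backup id nested under 'dataset' (read as
# snapshot['dataset']['snapshot_id'] above). The rest of the structure
# is replication-strategy specific; this is a sketch, not an exhaustive
# schema:
#
#     snapshot = {
#         'dataset': {
#             'snapshot_id': '<backup uuid used as replica_backup_id>',
#         },
#         # ... other datastore-specific replication keys ...
#     }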