def recover_grastate(self):
    """Recover UUID and sequence number

    There is no clean way to recover the database state (i.e. UUID
    and sequence number).  The only viable way to retrieve this
    information is by parsing the log created from running with
    wsrep_recover=on.

    Returns the GaleraState built from the first "Recovered position"
    entry found in the recovery log; the log file is removed on
    success.

    Raises ocf.GenericError if the log contains no "Recovered
    position" entry, or if GaleraState rejects the recovered value
    with a ValueError.  In both cases the log file is left in place
    and its path is included in the error message.
    """
    logfile = os.path.join(self.datadir, 'wsrep-recovery-%s.log' % uuid4())
    self.logger.info("Attempting recovery to %s", logfile)
    self.reconfigure(wsrep_recovery_log=logfile)
    self.systemctl_start(self.service)
    # Service should have stopped immediately after performing
    # recovery, but force a stop just in case.
    self.systemctl_stop(self.service)
    pattern = re.compile(r'^.*Recovered position:\s*(?P<state>\S+)$')
    # Open in text mode: the pattern is a text pattern, and matching
    # it against the bytes lines of a binary-mode file raises
    # TypeError on Python 3.
    with open(logfile, 'r') as f:
        matches = (pattern.match(line) for line in f)
        m = next((m for m in matches if m), None)
    if m is None:
        raise ocf.GenericError("Recovery failed: see %s" % logfile)
    try:
        state = GaleraState(m.group('state'))
    except ValueError as e:
        raise ocf.GenericError("%s: see %s" % (str(e), logfile))
    self.logger.info("Recovered %s from %s", state, logfile)
    os.remove(logfile)
    return state
def read_grastate(self):
    """Read state from Galera state file

    Parses self.grastate_file as a sequence of "key: value" lines,
    ignoring blank lines and lines containing only a "#" comment, and
    builds a GaleraState from the "uuid" and "seqno" entries.

    Returns the GaleraState, or None if the state file could not be
    opened.

    Raises ocf.GenericError if a line cannot be parsed, if either the
    UUID or the sequence number is absent, or if GaleraState rejects
    the values with a ValueError.
    """
    try:
        # Text mode: the patterns below are text patterns, and
        # matching them against bytes lines raises TypeError on
        # Python 3.
        f = open(self.grastate_file, 'r')
    except IOError:
        self.logger.error("Missing state file %s", self.grastate_file)
        return None
    raw = {}
    with f:
        for lineno, line in enumerate(f, start=1):
            # Skip blank lines and comment-only lines
            if re.match(r'^\s*(#.*)?$', line):
                continue
            m = re.match(r'^\s*(?P<key>\w+):\s*(?P<value>.*?)\s*$', line)
            if not m:
                raise ocf.GenericError("Corrupt %s on line %d" %
                                       (self.grastate_file, lineno))
            raw[m.group('key')] = m.group('value')
    uuid = raw.get('uuid')
    seqno = raw.get('seqno')
    if uuid is None:
        raise ocf.GenericError("Missing UUID in %s" % self.grastate_file)
    if seqno is None:
        raise ocf.GenericError("Missing sequence number in %s" %
                               self.grastate_file)
    try:
        state = GaleraState(uuid=uuid, seqno=seqno)
    except ValueError as e:
        raise ocf.GenericError("%s in %s" % (str(e), self.grastate_file))
    self.logger.info("Found %s in %s", state, self.grastate_file)
    return state
def service_start(self):
    """Start slave service

    Stores the node state in self.state, taking it from
    read_grastate() and falling back to recover_grastate() when the
    former returns a false value.

    Raises ocf.GenericError if a cluster UUID is already set and the
    node UUID is neither ZERO_UUID_STRING nor that cluster UUID.
    """
    # Record state parameters (performing recovery if needed)
    recorded = self.read_grastate()
    if not recorded:
        recorded = self.recover_grastate()
    self.state = recorded

    # Nothing to compare against until a cluster UUID has been set
    if self.cluster_uuid is None:
        return

    # Check that UUID matches cluster UUID
    if self.uuid not in (ZERO_UUID_STRING, self.cluster_uuid):
        raise ocf.GenericError("UUID does not match cluster UUID")
def rabbitmqctl(*args):
    """Perform an action via rabbitmqctl

    Runs "rabbitmqctl" with the given arguments, capturing stderr
    together with stdout.

    Returns the captured output as text with trailing newlines
    removed.

    Raises ocf.GenericError if the command exits with a non-zero
    status; the error carries the captured output or, if there was no
    output, the exit status.
    """
    command = ('rabbitmqctl', ) + args
    try:
        # Request text output: without universal_newlines the output
        # is bytes on Python 3 and rstrip('\n') raises TypeError.
        output = subprocess.check_output(command, stderr=subprocess.STDOUT,
                                         universal_newlines=True)
    except subprocess.CalledProcessError as e:
        raise ocf.GenericError(e.output or e.returncode)
    return output.rstrip('\n')
def action_promote(self):
    """Promote resource

    Returns ocf.SUCCESS once the parent class promotion has completed.

    Raises ocf.GenericError if self.meta_notify_master_unames is empty
    and choose_bootstrap() returns either None or a different node.
    """
    # Concurrent bootstrapping of multiple nodes must be refused.
    # It can arise if e.g. all nodes have been demoted (but not
    # stopped) due to a brief network interruption.
    if not self.meta_notify_master_unames:
        chosen = self.choose_bootstrap()
        if chosen is None:
            raise ocf.GenericError("Refusing to promote without bootstrap")
        if chosen != self:
            message = "Refusing concurrent promotion with %s" % chosen.node
            raise ocf.GenericError(message)

    # Hand over to the parent class to start the master service
    super(BootstrappingAgent, self).action_promote()

    # The bootstrap node triggers promotion of all remaining nodes
    if self.is_bootstrap:
        self.trigger_promote_all()

    return ocf.SUCCESS
def systemctl(self, action, unit=None):
    """Perform an action via systemctl

    Runs "systemctl <action> <unit>", capturing stderr together with
    stdout.  The unit defaults to self.service when not given.

    Returns the captured output as text with trailing newlines
    removed.

    Raises ocf.GenericError if the command exits with a non-zero
    status; the error carries the captured output or, if there was no
    output, the exit status.
    """
    if unit is None:
        unit = self.service
    command = ('systemctl', action, unit)
    try:
        # Request text output: without universal_newlines the output
        # is bytes on Python 3 and rstrip('\n') raises TypeError.
        output = subprocess.check_output(command, stderr=subprocess.STDOUT,
                                         universal_newlines=True)
    except subprocess.CalledProcessError as e:
        raise ocf.GenericError(e.output or e.returncode)
    return output.rstrip('\n')
def master_is_running(self):
    """Check if master service is running

    Returns False if the service unit is not active, or if the
    reported wsrep_local_state differs from WSREP_STATE_SYNCED (which
    is also logged as an error); returns True otherwise.

    Raises ocf.GenericError if the status query output cannot be
    parsed.
    """
    # An inactive unit cannot be a running master; skip the SQL query
    if not self.systemctl_is_active(self.service):
        return False

    output = self.mysql_exec("SHOW STATUS LIKE 'wsrep_local_state'")
    found = re.match(r'^\s*wsrep_local_state\s+(?P<state>\d+)\s*$', output)
    if found is None:
        raise ocf.GenericError("Unable to determine state:\n%s" % output)

    local_state = int(found.group('state'))
    if local_state != WSREP_STATE_SYNCED:
        self.logger.error("Unexpected local state %d", local_state)
        return False
    return True
def master_start(self):
    """Start master service

    Starts the service, re-reads the state file, stores the result in
    self.state, and records the node UUID as the cluster UUID when
    none was set.

    Raises ocf.GenericError if the node UUID conflicts with an
    already-set cluster UUID, if no state can be read after the
    service has started, or if the UUID read back differs from a
    non-zero UUID held before the start.
    """
    # A cluster UUID, once set, must match ours (or ours must be zero)
    if (self.cluster_uuid is not None and
            self.uuid not in (ZERO_UUID_STRING, self.cluster_uuid)):
        raise ocf.GenericError("UUID does not match cluster UUID")

    # Delete empty primary component state file, if present
    self.delete_empty_gvwstate()

    # Start service (in normal mode)
    self.logger.info("Beginning at %s", self.state)
    self.systemctl_start(self.service)

    # Validate the freshly read state before replacing the recorded
    # one; the UUID comparison has to happen ahead of the assignment
    # to self.state.
    fresh = self.read_grastate()
    if fresh is None:
        raise ocf.GenericError("Unable to determine state after promotion")
    if self.uuid not in (ZERO_UUID_STRING, fresh.uuid):
        raise ocf.GenericError("UUID changed unexpectedly after promotion")
    self.state = fresh

    # Record cluster UUID if not already set
    if self.cluster_uuid is None:
        self.logger.info("Set new cluster UUID %s", self.uuid)
        self.cluster_uuid = self.uuid
def mysql_exec(self, sql):
    """Execute SQL statement

    Runs the "mysql" client as the system user named by self.user,
    passing the statement via "-e" and capturing stderr together with
    stdout.

    Returns the captured output as text with trailing newlines
    removed.

    Raises ocf.GenericError if the client exits with a non-zero
    status; the error carries the captured output or, if there was no
    output, the exit status.
    """
    user = pwd.getpwnam(self.user)

    def preexec():
        """Run as specified user"""
        # Reset supplementary groups as well: setgid()/setuid() alone
        # leave the child holding the caller's group memberships.
        # Only attempted with root privileges, since changing the
        # group list fails otherwise.
        if os.geteuid() == 0:
            os.initgroups(user.pw_name, user.pw_gid)
        # Group first: once the UID is dropped, the GID can no longer
        # be changed.
        os.setgid(user.pw_gid)
        os.setuid(user.pw_uid)

    command = ('mysql', '-s', '-u', self.user, '-e', sql)
    try:
        # Request text output: without universal_newlines the output
        # is bytes on Python 3 and rstrip('\n') raises TypeError.
        output = subprocess.check_output(command, preexec_fn=preexec,
                                         stderr=subprocess.STDOUT,
                                         universal_newlines=True)
    except subprocess.CalledProcessError as e:
        raise ocf.GenericError(e.output or e.returncode)
    return output.rstrip('\n')