def teardown(self):
    self._strobe_api_kill()
    for node_id in self.nodes:
        node = self.nodes[node_id]
        chaos_event_log.info(
            m(f"terminating kvelldb on {node_id}").with_time())
        node.kill()
    for node_id in self.nodes:
        node = self.nodes[node_id]
        if node.is_service_running():
            chaos_event_log.info(
                m(f"kvelldb on {node_id} is still running").with_time())
            raise Exception(f"kvelldb on {node_id} is still running")
    for node_id in self.nodes:
        node = self.nodes[node_id]
        chaos_event_log.info(
            m(f"umount data dir on {node_id}").with_time())
        node.umount()
    for node_id in self.nodes:
        node = self.nodes[node_id]
        chaos_event_log.info(m(f"removing data on {node_id}").with_time())
        node.wipe_out()

def recover(self): chaos_event_log.info( m(f"removing disk fault injection & restarting a service on {self.node.node_id}" ).with_time()) self.node.io_recover() self.node.kill() self.node.start_service() attempts = 4 while True: attempts -= 1 time.sleep(5) if not self.node.is_service_running(): chaos_event_log.info( m(f"a service on {self.node.node_id} isn't running"). with_time()) if attempts < 0: raise Exception( f"can't start a service on {self.node.node_id}") else: continue break self.workload.availability_logger.log_recovery( f"disk fault injection removed & a service on {self.node.node_id} restarted" ) chaos_event_log.info( m(f"disk fault injection removed & a service on {self.node.node_id} restarted" ).with_time())
def teardown(self):
    self._kill_api()
    self._strobe_api_kill()
    self._rm_api_log()
    for node_id in self.nodes:
        node = self.nodes[node_id]
        chaos_event_log.info(
            m(f"terminating redpanda on {node_id}").with_time())
        node.kill()
    # TODO: add several checks (kill -9 may be async)
    for node_id in self.nodes:
        node = self.nodes[node_id]
        if node.is_service_running():
            chaos_event_log.info(
                m(f"redpanda on {node_id} is still running").with_time())
            raise Exception(f"redpanda on {node_id} is still running")
    for node_id in self.nodes:
        node = self.nodes[node_id]
        chaos_event_log.info(
            m(f"umount data dir on {node_id}").with_time())
        node.umount()
    for node_id in self.nodes:
        node = self.nodes[node_id]
        chaos_event_log.info(m(f"removing data on {node_id}").with_time())
        node.wipe_out()

def recover(self): chaos_event_log.info( m(f"removing disk delay on {self.node.node_id}").with_time()) self.node.io_recover() self.workload.availability_logger.log_recovery( f"disk delay on {self.node.node_id} removed") chaos_event_log.info( m(f"disk delay on {self.node.node_id} removed").with_time())
def recover(self): chaos_event_log.info( m(f"resuming a service on {self.node.node_id}").with_time()) self.node.continue_service() self.workload.availability_logger.log_recovery( f"a service on {self.node.node_id} resumed") chaos_event_log.info( m(f"a service on {self.node.node_id} resumed").with_time())
def recover(self): chaos_event_log.info( m(f"stopping strobbing on {self.node.node_id}").with_time()) self.node.strobe_recover() chaos_event_log.info( m(f"stopped strobbing on {self.node.node_id}").with_time()) self.workload.availability_logger.log_recovery( f"stopped strobbing on {self.node.node_id}")
def recover(self):
    peers = ", ".join(self.peers)
    chaos_event_log.info(
        m(f"rejoining node {self.node.node_id} to {peers}").with_time())
    self.node.rejoin(self.ips)
    self.workload.availability_logger.log_recovery(
        f"node {self.node.node_id} rejoined to {peers}")
    chaos_event_log.info(
        m(f"node {self.node.node_id} rejoined to {peers}").with_time())

def _start_service(self): for node_id in self.nodes: node = self.nodes[node_id] chaos_event_log.info( m(f"starting kvelldb on {node_id}").with_time()) node.start_service() for node_id in self.nodes: node = self.nodes[node_id] if not node.is_service_running(): chaos_event_log.info( m(f"kvelldb isn't running on {node_id}").with_time()) raise Exception(f"kvelldb on {node_id} isn't running")
def inject(self, cluster, workload): self.workload = workload self.node = self.node_selector(cluster) if self.node == None: chaos_event_log.info(m("can't select a node").with_time()) raise Exception("can't select a node") chaos_event_log.info( m(f"injecting 10ms disk delay on {self.node.node_id} ({self.scope})" ).with_time()) self.workload.availability_logger.log_fault( f"injecting 10ms disk delay on {self.node.node_id} ({self.scope})") self.node.io_delay(10) chaos_event_log.info( m(f"10ms disk delay on {self.node.node_id} injected").with_time())
def inject(self, cluster, workload): self.workload = workload self.node = self.node_selector(cluster) if self.node == None: chaos_event_log.info(m("can't select a node").with_time()) raise Exception("can't select a node") chaos_event_log.info( m(f"suspending a service on {self.node.node_id} ({self.scope})"). with_time()) self.workload.availability_logger.log_fault( f"suspending a service on {self.node.node_id} ({self.scope})") self.node.pause_service() chaos_event_log.info( m(f"a service on {self.node.node_id} suspended").with_time())
def init(self, write_id, version, value):
    self.head = AcceptedWrite(0, write_id, version, value)
    cmdlog.info(
        m(type="linearization_point", write_id=write_id,
          value=value).with_time())
    self.history_by_idx[self.head.idx] = self.head
    self.history_by_write_id[self.head.write_id] = self.head

async def run(config_json, n, overrides):
    suite_id = int(time.time())
    with open(config_json, "r") as settings_json:
        config = json.load(settings_json)
    init_output(config, suite_id)
    shutil.copyfile(
        config_json,
        path.join(config["output"], str(suite_id), "settings.json"))
    if overrides:
        for override in overrides:
            [key, value] = override.split("=", 1)
            config[key] = json.loads(value)
    faults = {fault: known_faults[fault] for fault in config["faults"]}
    with RedpandaCluster(config) as cluster:
        try:
            for _ in range(0, n):
                if not config["reset_before_test"]:
                    await cluster.restart()
                if not await cluster.is_ok():
                    chaos_event_log.info(
                        m("cluster isn't healthy").with_time())
                    raise Exception("cluster isn't healthy")
                await inject_recover_scenarios_aio(
                    suite_id, config, cluster, faults,
                    lambda: workload_factory(config))
        except ViolationInducedExit:
            pass

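# Illustrative note (not part of the harness): each override is a "key=json"
# pair, so passing the override string 'reset_before_test=false' yields
# key = "reset_before_test" and value = json.loads("false") -> False.
# Parsing the value as JSON is what lets non-string settings (booleans,
# numbers, lists) be overridden from the command line.
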
async def wait(self, period_ms):
    while self.active:
        await asyncio.sleep(float(period_ms) / 1000)
        if self.has_error:
            msg = m("error on fault injection / recovery",
                    type=self.error_type,
                    value=self.error_value,
                    stacktrace=self.error_stacktrace).with_time()
            chaos_event_log.info(msg)
            raise Exception(str(msg))

def inject(self, cluster, workload): self.workload = workload self.node = self.node_selector(cluster) if self.node == None: chaos_event_log.info(m("can't select a node").with_time()) raise Exception("can't select a node") for node_id in cluster.nodes.keys(): if node_id != self.node.node_id: self.ips.append(cluster.nodes[node_id].ip) self.peers.append(node_id) peers = ", ".join(self.peers) chaos_event_log.info( m(f"isolating node {self.node.node_id} ({self.scope}) from {peers}" ).with_time()) self.workload.availability_logger.log_fault( f"isolating node {self.node.node_id} ({self.scope}) from {peers}") self.node.isolate(self.ips) chaos_event_log.info( m(f"node {self.node.node_id} isolated from {peers}").with_time())
def _start_service(self): for node_id in self.nodes: node = self.nodes[node_id] chaos_event_log.info( m(f"starting redpanda on {node_id}").with_time()) node.start_service() attempts = 2 while True: attempts -= 1 time.sleep(5) running = True for node_id in self.nodes: node = self.nodes[node_id] if not node.is_service_running(): running = False chaos_event_log.info( m(f"redpanda on {node_id} isn't running").with_time()) if attempts < 0: raise Exception(f"redpanda on {node_id} isn't running") if running: break
def teardown(self): self._kill_api() self._strobe_api_kill() self._rm_api_log() for node_id in self.nodes: node = self.nodes[node_id] chaos_event_log.info( m(f"terminating kafka on {node_id}").with_time()) node.kill_kafka() chaos_event_log.info( m(f"terminating zookeeper on {node_id}").with_time()) node.kill_zookeeper() for node_id in self.nodes: node = self.nodes[node_id] chaos_event_log.info( m(f"umount data dir on {node_id}").with_time()) node.umount() for node_id in self.nodes: node = self.nodes[node_id] chaos_event_log.info(m(f"removing data on {node_id}").with_time()) node.wipe_out()
def continue_service(self):
    try:
        # the [re]dpanda bracket pattern keeps egrep from matching its own
        # process entry in the ps output
        self.node.account.ssh_output(
            "ps aux | egrep [re]dpanda/bin | awk '{print $2}' | xargs -r kill -CONT",
            allow_fail=False)
    except:
        e, v = sys.exc_info()[:2]
        stacktrace = traceback.format_exc()
        errors_log.info(
            m("Failed to resume redpanda",
              error_type=str(e),
              error_value=str(v),
              stacktrace=stacktrace))
        raise

async def cas_aio(self, key, prev_write_id, value, write_id):
    data = None
    try:
        resp = await self.session.post(f"http://{self.address}/cas",
                                       data=json.dumps({
                                           "key": key,
                                           "prevWriteID": prev_write_id,
                                           "value": value,
                                           "writeID": write_id
                                       }))
        if resp.status == 200:
            data = await resp.read()
        else:
            raise RequestTimedout()
    except aiohttp.client_exceptions.ServerDisconnectedError:
        raise RequestTimedout()
    except aiohttp.client_exceptions.ClientConnectorError:
        raise RequestTimedout()
    except aiohttp.client_exceptions.ClientOSError:
        raise RequestTimedout()
    except ConnectionResetError:
        raise RequestTimedout()
    except asyncio.TimeoutError:
        raise RequestTimedout()
    except:
        e, v = sys.exc_info()[:2]
        cmdlog.info(
            m("unexpected kv/cas error",
              type="error",
              error_type=str(e),
              error_value=str(v),
              stacktrace=traceback.format_exc()).with_time())
        raise RequestTimedout()
    data = json.loads(data)
    record = None
    if data["status"] == "ok":
        if data["hasData"]:
            record = Record(data["writeID"], data["value"])
    elif data["status"] == "unknown":
        raise RequestTimedout()
    elif data["status"] == "fail":
        raise RequestCanceled()
    elif data["status"] == "violation":
        raise RequestViolated(data["info"])
    else:
        raise Exception(f"Unknown status: {data['status']}")
    return Response(record, data["metrics"])

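# A minimal usage sketch (illustrative, assuming `kv` is a client instance
# exposing cas_aio): per the status mapping above, RequestTimedout is an
# indeterminate outcome while RequestCanceled is a definite rejection, so a
# caller has to keep a timed-out write in its pending set:
#
#     try:
#         resp = await kv.cas_aio("key1", prev_write_id, "v2", write_id)
#     except RequestTimedout:
#         pass  # outcome unknown: the write may still be applied later
#     except RequestCanceled:
#         pass  # outcome known: the write was rejected
#     except RequestViolated:
#         raise  # the service itself reported a consistency violation
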
def inject(self, cluster, workload):
    try:
        self.workload = workload
        self.node = self.node_selector(cluster)
        if self.node is None:
            chaos_event_log.info(m("can't select a node").with_time())
            raise Exception("can't select a node")
        chaos_event_log.info(
            m(f"starting strobing on {self.node.node_id} ({self.scope})").
            with_time())
        self.workload.availability_logger.log_fault(
            f"starting strobing on {self.node.node_id} ({self.scope})")
        self.node.strobe_inject()
        chaos_event_log.info(
            m(f"strobing on {self.node.node_id}").with_time())
    except:
        e, v = sys.exc_info()[:2]
        chaos_event_log.info(
            m("can't inject strobe",
              error_type=str(e),
              error_value=str(v),
              stacktrace=traceback.format_exc()).with_time())
        raise

def rejoin(self, ips):
    # delete the iptables DROP rules so the node can reach its peers again
    cmd = []
    for ip in ips:
        cmd.append(f"sudo iptables -D INPUT -s {ip} -j DROP")
        cmd.append(f"sudo iptables -D OUTPUT -d {ip} -j DROP")
    cmd = " && ".join(cmd)
    try:
        self.node.account.ssh_output(cmd, allow_fail=False)
    except:
        e, v = sys.exc_info()[:2]
        stacktrace = traceback.format_exc()
        errors_log.info(
            m("Failed to rejoin node",
              error_type=str(e),
              error_value=str(v),
              stacktrace=stacktrace))
        raise

def gc(self):
    midx = self.head.idx
    for key in self.reads:
        midx = min(self.reads[key], midx)
    for idx in list(filter(lambda x: x < midx, self.history_by_idx.keys())):
        write = self.history_by_idx[idx]
        del self.history_by_write_id[write.write_id]
        del self.history_by_idx[idx]
    for key in list(self.pending_writes.keys()):
        if self.pending_writes[key].version < self.head.version:
            cmdlog.info(
                m(type="gc", head=self.head.write_id,
                  garbage=key).with_time())
            # eventually a client that initiated a garbage-collected request
            # observes a timeout or an error, invokes write_canceled or
            # write_timeouted, and cleans self.applied and self.gced
            self.gced[key] = True
            del self.pending_writes[key]

def io_ruin(self):
    result = None
    try:
        result = self.node.account.ssh_output(
            f"curl -s 127.0.0.1:{MountMuService.IOFAULT_PORT}/ruin",
            allow_fail=False)
        result = json.loads(result)
        if result["status"] != "ok":
            raise Exception("Failed to ruin io: expected status=ok got: " +
                            json.dumps(result))
    except:
        e, v = sys.exc_info()[:2]
        stacktrace = traceback.format_exc()
        errors_log.info(
            m("Failed to ruin io",
              error_type=str(e),
              error_value=str(v),
              stacktrace=stacktrace))
        raise

async def get_aio(self, key, read_id):
    data = None
    try:
        resp = await self.session.get(
            f"http://{self.address}/read?key={key}&read_id={read_id}")
        if resp.status == 200:
            data = await resp.read()
        else:
            raise RequestTimedout()
    except aiohttp.client_exceptions.ServerDisconnectedError:
        raise RequestTimedout()
    except aiohttp.client_exceptions.ClientConnectorError:
        raise RequestTimedout()
    except aiohttp.client_exceptions.ClientOSError:
        raise RequestTimedout()
    except ConnectionResetError:
        raise RequestTimedout()
    except asyncio.TimeoutError:
        raise RequestTimedout()
    except:
        e, v = sys.exc_info()[:2]
        cmdlog.info(
            m("unexpected kv/get error",
              type="error",
              error_type=str(e),
              error_value=str(v),
              stacktrace=traceback.format_exc()).with_time())
        raise RequestTimedout()
    data = json.loads(data)
    record = None
    if data["status"] == "ok":
        if data["hasData"]:
            record = Record(data["writeID"], data["value"])
    elif data["status"] == "unknown":
        raise RequestTimedout()
    elif data["status"] == "fail":
        raise RequestCanceled()
    else:
        raise Exception(f"Unknown status: {data['status']}")
    return Response(record, data["metrics"])

async def is_ok(self):
    is_ok = False
    for endpoint in self.config["endpoints"]:
        host = endpoint["host"]
        port = endpoint["httpport"]
        address = f"{host}:{port}"
        kv = KVNode(address, address)
        try:
            await kv.put_aio("test", "value1", "wid1")
            is_ok = True
        except RequestTimedout:
            chaos_event_log.info(
                m(f"put request to {address} timed out").with_time())
        except RequestCanceled:
            pass
        await kv.close_aio()
        if is_ok:
            return True
    return is_ok

def __init__(self, service, redpanda_mu, node):
    self.service = service
    self.node = node
    self.node_id = node.account.hostname
    self.redpanda_mu = redpanda_mu
    try:
        ip = self.node.account.ssh_output("getent hosts " + self.node_id +
                                          " | awk '{ printf $1 }'",
                                          allow_fail=False)
        self.ip = ip.decode("utf-8")
    except:
        e, v = sys.exc_info()[:2]
        stacktrace = traceback.format_exc()
        errors_log.info(
            m(f"Failed to resolve {self.node_id} to ip",
              error_type=str(e),
              error_value=str(v),
              stacktrace=stacktrace))
        raise

def observe(self, write_id):
    write = self.pending_writes[write_id]
    chain = []
    while True:
        if write.prev_write_id == self.head.write_id:
            chain.append(write)
            if self.head.version >= write.version:
                raise Violation(
                    " -> ".join(map(idstr, chain)) + " -> " +
                    write.prev_write_id +
                    " doesn't lead to the latest observed state: " +
                    idstr(self.head))
            for w in reversed(chain):
                self.applied[w.write_id] = True
                del self.pending_writes[w.write_id]
                self.head = AcceptedWrite(self.head.idx + 1, w.write_id,
                                          w.version, w.value)
                self.history_by_idx[self.head.idx] = self.head
                self.history_by_write_id[self.head.write_id] = self.head
                cmdlog.info(
                    m(type="linearization_point",
                      write_id=w.write_id,
                      value=w.value).with_time())
            break
        elif write.prev_write_id in self.pending_writes:
            chain.append(write)
            if self.pending_writes[
                    write.prev_write_id].version >= write.version:
                raise Violation(
                    " -> ".join(map(idstr, chain)) + " -> " +
                    write.prev_write_id +
                    " doesn't lead to the pending state: " +
                    idstr(self.pending_writes[write.prev_write_id]))
            write = self.pending_writes[write.prev_write_id]
        else:
            chain.append(write)
            raise Violation(
                " -> ".join(map(idstr, chain)) + " -> " +
                write.prev_write_id +
                " doesn't lead to the latest observed state: " +
                idstr(self.head))

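# Worked example of the chain walk above (illustrative): with
# head = AcceptedWrite(0, "w0", 0, value) and pending writes
# w1{prev_write_id="w0", version=1} and w2{prev_write_id="w1", version=2},
# observe("w2") follows w2 -> w1 -> "w0", then applies the chain oldest-first
# (w1, then w2), advancing head to w2 and logging a linearization_point for
# each write. Had any link's version failed to increase, or had the chain
# ended at a write id that is neither head nor pending, a Violation would
# have been raised instead.
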
def inject(self, cluster, workload): self.workload = workload self.node = self.node_selector(cluster) if self.node == None: chaos_event_log.info(m("can't select a node").with_time()) raise Exception("can't select a node") try: chaos_event_log.info( m(f"terminating a service on {self.node.node_id} ({self.scope})" ).with_time()) self.workload.availability_logger.log_fault( f"terminating a service on {self.node.node_id} ({self.scope})") self.node.kill() chaos_event_log.info( m(f"a service on {self.node.node_id} terminated").with_time()) except: e, v = sys.exc_info()[:2] stacktrace = traceback.format_exc() chaos_event_log.info( m("error on terminating a service", error_type=str(e), error_value=str(v), stacktrace=stacktrace).with_time()) raise is_running = True for _ in range(0, 3): time.sleep(1) try: is_running = self.node.is_service_running() except: e, v = sys.exc_info()[:2] stacktrace = traceback.format_exc() chaos_event_log.info( m("error on checking status of a service", error_type=str(e), error_value=str(v), stacktrace=stacktrace).with_time()) raise if is_running: chaos_event_log.info( m(f"can't terminate a service on {self.node.node_id}"). with_time()) raise Exception( f"can't terminate a service on {self.node.node_id}")
async def restart(self):
    chaos_stdout.info("(re)starting a cluster")
    self.teardown()
    self._mount()
    for node_id in self.nodes:
        node = self.nodes[node_id]
        chaos_event_log.info(m(f"preparing dirs {node_id}").with_time())
        node.prep_dirs()
    for node_id in self.nodes:
        node = self.nodes[node_id]
        node.write_zookeeper_configs()
        node.write_kafka_config()
    for node_id in self.nodes:
        node = self.nodes[node_id]
        node.start_zookeeper()
    for node_id in self.nodes:
        node = self.nodes[node_id]
        node.start_kafka()
    cluster_warmup = self.config["cluster_warmup"]
    await asyncio.sleep(cluster_warmup)
    chaos_stdout.info("cluster started")
    chaos_stdout.info("creating topic")
    node = self.any_node()
    node.create_topic()
    chaos_stdout.info("topic created")
    self._start_api()
    self._strobe_api_start()
    self._strobe_recover()
    # TODO: Replace sleep with an explicit check waiting for kafkakv & strobe
    # services to start
    time.sleep(2)
    chaos_stdout.info("")

async def run(config, n, overrides):
    init_output(config)
    if overrides:
        for override in overrides:
            [key, value] = override.split("=", 1)
            config[key] = json.loads(value)
    faults = {fault: known_faults[fault] for fault in config["faults"]}
    with RedpandaCluster(config) as cluster:
        try:
            for _ in range(0, n):
                if not config["reset_before_test"]:
                    await cluster.restart()
                if not await cluster.is_ok():
                    chaos_event_log.info(
                        m("cluster isn't healthy").with_time())
                    raise Exception("cluster isn't healthy")
                await inject_recover_scenarios_aio(
                    config, cluster, faults,
                    lambda: workload_factory(config))
        except ViolationInducedExit:
            pass

def _mount(self):
    for node_id in self.nodes:
        node = self.nodes[node_id]
        chaos_event_log.info(m(f"mount data dir on {node_id}").with_time())
        node.mount()