示例#1
0
    def teardown(self):
        """Stop kvelldb everywhere, verify it is down, then unmount and wipe data dirs."""
        self._strobe_api_kill()

        # Phase 1: send kill to every node.
        for node_id, node in self.nodes.items():
            chaos_event_log.info(
                m(f"terminating kvelldb on {node_id}").with_time())
            node.kill()

        # Phase 2: verify nothing survived the kill.
        for node_id, node in self.nodes.items():
            if node.is_service_running():
                chaos_event_log.info(
                    m(f"kvelldb on {node_id} is still running").with_time())
                raise Exception(f"kvelldb on {node_id} is still running")

        # Phase 3: unmount the data directories.
        for node_id, node in self.nodes.items():
            chaos_event_log.info(
                m(f"umount data dir on {node_id}").with_time())
            node.umount()

        # Phase 4: wipe the data.
        for node_id, node in self.nodes.items():
            chaos_event_log.info(m(f"removing data on {node_id}").with_time())
            node.wipe_out()
示例#2
0
文件: faults.py 项目: krambn/redpanda
    def recover(self):
        """Remove the disk fault, bounce the service, and poll until it is back up."""
        chaos_event_log.info(
            m(f"removing disk fault injection & restarting a service on {self.node.node_id}"
              ).with_time())
        self.node.io_recover()
        self.node.kill()
        self.node.start_service()

        # Poll up to five times, 5s apart; give up when the budget runs out.
        for attempts_left in range(3, -2, -1):
            time.sleep(5)
            if self.node.is_service_running():
                break
            chaos_event_log.info(
                m(f"a service on {self.node.node_id} isn't running").
                with_time())
            if attempts_left < 0:
                raise Exception(
                    f"can't start a service on {self.node.node_id}")

        self.workload.availability_logger.log_recovery(
            f"disk fault injection removed & a service on {self.node.node_id} restarted"
        )
        chaos_event_log.info(
            m(f"disk fault injection removed & a service on {self.node.node_id} restarted"
              ).with_time())
    def teardown(self):
        """Kill the helper APIs, stop redpanda everywhere, then unmount and wipe data dirs."""
        self._kill_api()
        self._strobe_api_kill()
        self._rm_api_log()

        # Terminate redpanda on every node.
        for node_id, node in self.nodes.items():
            chaos_event_log.info(
                m(f"terminating redpanda on {node_id}").with_time())
            node.kill()

        # TODO: add several checks (kill -9 may be async)
        for node_id, node in self.nodes.items():
            if node.is_service_running():
                chaos_event_log.info(
                    m(f"redpanda on {node_id} is still running").with_time())
                raise Exception(f"redpanda on {node_id} is still running")

        # Unmount the data directories.
        for node_id, node in self.nodes.items():
            chaos_event_log.info(
                m(f"umount data dir on {node_id}").with_time())
            node.umount()

        # Wipe the data.
        for node_id, node in self.nodes.items():
            chaos_event_log.info(m(f"removing data on {node_id}").with_time())
            node.wipe_out()
示例#4
0
文件: faults.py 项目: krambn/redpanda
 def recover(self):
     """Clear the injected disk delay and record the recovery."""
     node_id = self.node.node_id
     chaos_event_log.info(
         m(f"removing disk delay on {node_id}").with_time())
     self.node.io_recover()
     self.workload.availability_logger.log_recovery(
         f"disk delay on {node_id} removed")
     chaos_event_log.info(
         m(f"disk delay on {node_id} removed").with_time())
示例#5
0
文件: faults.py 项目: krambn/redpanda
 def recover(self):
     """Resume the suspended service and record the recovery."""
     node_id = self.node.node_id
     chaos_event_log.info(
         m(f"resuming a service on {node_id}").with_time())
     self.node.continue_service()
     self.workload.availability_logger.log_recovery(
         f"a service on {node_id} resumed")
     chaos_event_log.info(
         m(f"a service on {node_id} resumed").with_time())
示例#6
0
文件: faults.py 项目: krambn/redpanda
 def recover(self):
     """Stop the strobe fault and record the recovery."""
     node_id = self.node.node_id
     chaos_event_log.info(
         m(f"stopping strobbing on {node_id}").with_time())
     self.node.strobe_recover()
     chaos_event_log.info(
         m(f"stopped strobbing on {node_id}").with_time())
     self.workload.availability_logger.log_recovery(
         f"stopped strobbing on {node_id}")
示例#7
0
文件: faults.py 项目: krambn/redpanda
 def recover(self):
     """Reconnect the isolated node to all of its peers and log the recovery."""
     peers = ", ".join(self.peers)
     node_id = self.node.node_id
     chaos_event_log.info(
         m(f"rejoining node {node_id} to {peers}").with_time())
     self.node.rejoin(self.ips)
     self.workload.availability_logger.log_recovery(
         f"node {node_id} rejoined to {peers}")
     chaos_event_log.info(
         m(f"node {node_id} rejoined to {peers}").with_time())
示例#8
0
    def _start_service(self):
        """Start kvelldb on every node, then fail fast if any instance is down."""
        for node_id, node in self.nodes.items():
            chaos_event_log.info(
                m(f"starting kvelldb on {node_id}").with_time())
            node.start_service()

        # Single verification pass, no retry budget.
        for node_id, node in self.nodes.items():
            if not node.is_service_running():
                chaos_event_log.info(
                    m(f"kvelldb isn't running on {node_id}").with_time())
                raise Exception(f"kvelldb on {node_id} isn't running")
示例#9
0
文件: faults.py 项目: krambn/redpanda
    def inject(self, cluster, workload):
        """Inject a 10ms disk delay on a node chosen by the node selector.

        Raises Exception when no suitable node can be selected.
        """
        self.workload = workload
        self.node = self.node_selector(cluster)
        # identity check (`is None`) instead of `== None`
        if self.node is None:
            chaos_event_log.info(m("can't select a node").with_time())
            raise Exception("can't select a node")

        chaos_event_log.info(
            m(f"injecting 10ms disk delay on {self.node.node_id} ({self.scope})"
              ).with_time())
        self.workload.availability_logger.log_fault(
            f"injecting 10ms disk delay on {self.node.node_id} ({self.scope})")
        self.node.io_delay(10)
        chaos_event_log.info(
            m(f"10ms disk delay on {self.node.node_id} injected").with_time())
示例#10
0
文件: faults.py 项目: krambn/redpanda
    def inject(self, cluster, workload):
        """Suspend (pause) the service on a node chosen by the node selector.

        Raises Exception when no suitable node can be selected.
        """
        self.workload = workload
        self.node = self.node_selector(cluster)
        # identity check (`is None`) instead of `== None`
        if self.node is None:
            chaos_event_log.info(m("can't select a node").with_time())
            raise Exception("can't select a node")

        chaos_event_log.info(
            m(f"suspending a service on {self.node.node_id} ({self.scope})").
            with_time())
        self.workload.availability_logger.log_fault(
            f"suspending a service on {self.node.node_id} ({self.scope})")
        self.node.pause_service()
        chaos_event_log.info(
            m(f"a service on {self.node.node_id} suspended").with_time())
示例#11
0
 def init(self, write_id, version, value):
     """Seed the observed history with an initial accepted write at index 0."""
     head = AcceptedWrite(0, write_id, version, value)
     self.head = head
     cmdlog.info(
         m(type="linearization_point", write_id=write_id,
           value=value).with_time())
     self.history_by_idx[head.idx] = head
     self.history_by_write_id[head.write_id] = head
示例#12
0
async def run(config_json, n, overrides):
    """Run `n` chaos-test iterations using the settings file `config_json`.

    `overrides` is an iterable of "key=value" strings where value is JSON;
    each is applied on top of the loaded config. A copy of the settings is
    stored in the suite's output directory for provenance.
    """
    suite_id = int(time.time())

    with open(config_json, "r") as settings_json:
        config = json.load(settings_json)

    init_output(config, suite_id)

    # Keep the settings next to the suite output for later inspection.
    shutil.copyfile(
        config_json, path.join(config["output"], str(suite_id),
                               "settings.json"))

    if overrides:
        for override in overrides:  # fixed "overide" typo
            [key, value] = override.split("=", 1)
            config[key] = json.loads(value)

    faults = {fault: known_faults[fault] for fault in config["faults"]}

    with RedpandaCluster(config) as cluster:
        try:
            for _ in range(0, n):
                if not config["reset_before_test"]:
                    await cluster.restart()
                    if not await cluster.is_ok():
                        chaos_event_log.info(
                            m("cluster isn't healthy").with_time())
                        raise Exception("cluster isn't healthy")
                await inject_recover_scenarios_aio(
                    suite_id, config, cluster, faults,
                    lambda: workload_factory(config))
        except ViolationInducedExit:
            pass
示例#13
0
    async def wait(self, period_ms):
        """Sleep in `period_ms` steps while active; re-raise any recorded fault error."""
        delay = float(period_ms) / 1000
        while self.active:
            await asyncio.sleep(delay)

        if not self.has_error:
            return
        msg = m("error on fault injection / recovery",
                type=self.error_type,
                value=self.error_value,
                stacktrace=self.error_stacktrace).with_time()
        chaos_event_log.info(msg)
        raise Exception(str(msg))
示例#14
0
文件: faults.py 项目: krambn/redpanda
    def inject(self, cluster, workload):
        """Isolate a selected node from all its peers via firewall rules.

        Raises Exception when no suitable node can be selected.
        """
        self.workload = workload
        self.node = self.node_selector(cluster)
        # identity check (`is None`) instead of `== None`
        if self.node is None:
            chaos_event_log.info(m("can't select a node").with_time())
            raise Exception("can't select a node")

        # Every other node is a peer to cut off; remember ips and ids.
        for node_id, peer in cluster.nodes.items():
            if node_id != self.node.node_id:
                self.ips.append(peer.ip)
                self.peers.append(node_id)

        peers = ", ".join(self.peers)
        chaos_event_log.info(
            m(f"isolating node {self.node.node_id} ({self.scope}) from {peers}"
              ).with_time())
        self.workload.availability_logger.log_fault(
            f"isolating node {self.node.node_id} ({self.scope}) from {peers}")
        self.node.isolate(self.ips)
        chaos_event_log.info(
            m(f"node {self.node.node_id} isolated from {peers}").with_time())
示例#15
0
    def _start_service(self):
        """Start redpanda on every node and poll until all instances report running."""
        for node_id, node in self.nodes.items():
            chaos_event_log.info(
                m(f"starting redpanda on {node_id}").with_time())
            node.start_service()

        # Up to three polls, 5s apart; raise once the budget is exhausted.
        for attempts_left in (1, 0, -1):
            time.sleep(5)
            all_running = True
            for node_id, node in self.nodes.items():
                if not node.is_service_running():
                    all_running = False
                    chaos_event_log.info(
                        m(f"redpanda on {node_id} isn't running").with_time())
                    if attempts_left < 0:
                        raise Exception(f"redpanda on {node_id} isn't running")
            if all_running:
                break
示例#16
0
    def teardown(self):
        """Kill the helper APIs, stop kafka and zookeeper, then unmount and wipe data dirs."""
        self._kill_api()
        self._strobe_api_kill()
        self._rm_api_log()
        # Stop both services on every node.
        for node_id, node in self.nodes.items():
            chaos_event_log.info(
                m(f"terminating kafka on {node_id}").with_time())
            node.kill_kafka()
            chaos_event_log.info(
                m(f"terminating zookeeper on {node_id}").with_time())
            node.kill_zookeeper()

        # Unmount the data directories.
        for node_id, node in self.nodes.items():
            chaos_event_log.info(
                m(f"umount data dir on {node_id}").with_time())
            node.umount()

        # Wipe the data.
        for node_id, node in self.nodes.items():
            chaos_event_log.info(m(f"removing data on {node_id}").with_time())
            node.wipe_out()
示例#17
0
 def continue_service(self):
     """Resume a paused redpanda by sending SIGCONT to its processes.

     Failures of the remote command are logged and re-raised.
     """
     try:
         self.node.account.ssh_output(
             "ps aux | egrep [re]dpanda/bin | awk '{print $2}' | xargs -r kill -CONT",
             allow_fail=False)
     except Exception:
         # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
         # propagate without being logged as command failures.
         e, v = sys.exc_info()[:2]
         stacktrace = traceback.format_exc()
         errors_log.info(
             m("Failed to resume redpanda",
               error_type=str(e),
               error_value=str(v),
               stacktrace=stacktrace))
         raise
示例#18
0
    async def cas_aio(self, key, prev_write_id, value, write_id):
        """Issue a compare-and-swap request against the kv service.

        Returns a Response on success. Raises RequestTimedout on transport
        failure or "unknown" outcome, RequestCanceled on "fail", and
        RequestViolated on "violation".
        """
        data = None
        try:
            resp = await self.session.post(f"http://{self.address}/cas",
                                           data=json.dumps({
                                               "key": key,
                                               "prevWriteID": prev_write_id,
                                               "value": value,
                                               "writeID": write_id
                                           }))
            if resp.status == 200:
                data = await resp.read()
            else:
                raise RequestTimedout()
        except RequestTimedout:
            # Deliberate timeout from the non-200 branch above: previously it
            # fell into the bare `except:` and was logged as "unexpected".
            raise
        except (aiohttp.client_exceptions.ServerDisconnectedError,
                aiohttp.client_exceptions.ClientConnectorError,
                aiohttp.client_exceptions.ClientOSError,
                ConnectionResetError, asyncio.TimeoutError):
            # All transport-level failures are reported as timeouts.
            raise RequestTimedout()
        except Exception:
            e, v = sys.exc_info()[:2]

            cmdlog.info(
                m("unexpected kv/cas error",
                  type="error",
                  error_type=str(e),
                  error_value=str(v),
                  stacktrace=traceback.format_exc()).with_time())

            raise RequestTimedout()
        data = json.loads(data)
        record = None
        if data["status"] == "ok":
            if data["hasData"]:
                record = Record(data["writeID"], data["value"])
        elif data["status"] == "unknown":
            raise RequestTimedout()
        elif data["status"] == "fail":
            raise RequestCanceled()
        elif data["status"] == "violation":
            raise RequestViolated(data["info"])
        else:
            raise Exception(f"Unknown status: {data['status']}")
        return Response(record, data["metrics"])
示例#19
0
文件: faults.py 项目: krambn/redpanda
    def inject(self, cluster, workload):
        """Start the strobe fault on a node chosen by the node selector.

        Any failure is logged with its stacktrace and re-raised.
        """
        try:
            self.workload = workload
            self.node = self.node_selector(cluster)
            # identity check (`is None`) instead of `== None`
            if self.node is None:
                chaos_event_log.info(m("can't select a node").with_time())
                raise Exception("can't select a node")

            chaos_event_log.info(
                m(f"starting strobing on {self.node.node_id} ({self.scope})").
                with_time())
            self.workload.availability_logger.log_fault(
                f"starting strobing on {self.node.node_id} ({self.scope})")
            self.node.strobe_inject()
            chaos_event_log.info(
                m(f"strobbing on {self.node.node_id}").with_time())
        except Exception:
            # Narrowed from a bare `except:`; log and re-raise.
            e, v = sys.exc_info()[:2]
            chaos_event_log.info(
                m("can't inject strobe",
                  error_type=str(e),
                  error_value=str(v),
                  stacktrace=traceback.format_exc()).with_time())
            raise
示例#20
0
    def rejoin(self, ips):
        """Remove the iptables DROP rules that isolate this node from `ips`.

        Failures of the remote command are logged and re-raised.
        """
        cmd = []
        for ip in ips:
            cmd.append(f"sudo iptables -D INPUT -s {ip} -j DROP")
            cmd.append(f"sudo iptables -D OUTPUT -d {ip} -j DROP")
        cmd = " && ".join(cmd)

        try:
            self.node.account.ssh_output(cmd, allow_fail=False)
        except Exception:
            # Narrowed from a bare `except:`. The log message previously read
            # "Failed to recover io" — a copy/paste from the io fault helpers;
            # this method removes network isolation rules.
            e, v = sys.exc_info()[:2]
            stacktrace = traceback.format_exc()
            errors_log.info(
                m("Failed to rejoin",
                  error_type=str(e),
                  error_value=str(v),
                  stacktrace=stacktrace))
            raise
示例#21
0
 def gc(self):
     """Drop history entries and pending writes that can no longer be observed."""
     # Oldest index any in-flight read may still reference.
     midx = min([self.head.idx, *self.reads.values()])
     # Collect stale indices first so we never mutate while iterating.
     stale = [idx for idx in self.history_by_idx if idx < midx]
     for idx in stale:
         write = self.history_by_idx[idx]
         del self.history_by_write_id[write.write_id]
         del self.history_by_idx[idx]
     for key in list(self.pending_writes.keys()):
         if self.pending_writes[key].version < self.head.version:
             cmdlog.info(
                 m(type="gc", head=self.head.write_id,
                   garbage=key).with_time())
             # eventually a client initiated a garbage collected request
             # observes a timeout or an error invoke write_canceled or
             # write_timeouted and clean self.applied and self.gced
             self.gced[key] = True
             del self.pending_writes[key]
示例#22
0
 def io_ruin(self):
     """Ask the node-local io-fault helper to corrupt ("ruin") disk io.

     Raises when the helper does not answer with status=ok or the remote
     command fails; failures are logged with stacktrace before re-raising.
     """
     result = None
     try:
         result = self.node.account.ssh_output(
             f"curl -s 127.0.0.1:{MountMuService.IOFAULT_PORT}/ruin",
             allow_fail=False)
         result = json.loads(result)
         if result["status"] != "ok":
             raise Exception("Failed to ruin io: expected status=ok got: " +
                             json.dumps(result))
     except Exception:
         # Narrowed from a bare `except:`; log and re-raise.
         e, v = sys.exc_info()[:2]
         stacktrace = traceback.format_exc()
         errors_log.info(
             m("Failed to ruin io",
               error_type=str(e),
               error_value=str(v),
               stacktrace=stacktrace))
         raise
示例#23
0
    async def get_aio(self, key, read_id):
        """Read `key` from the kv service.

        Returns a Response on success. Raises RequestTimedout on transport
        failure or "unknown" outcome and RequestCanceled on "fail".
        """
        data = None
        try:
            resp = await self.session.get(
                f"http://{self.address}/read?key={key}&read_id={read_id}")
            if resp.status == 200:
                data = await resp.read()
            else:
                raise RequestTimedout()
        except RequestTimedout:
            # Deliberate timeout from the non-200 branch above: previously it
            # fell into the bare `except:` and was logged as "unexpected".
            raise
        except (aiohttp.client_exceptions.ServerDisconnectedError,
                aiohttp.client_exceptions.ClientConnectorError,
                aiohttp.client_exceptions.ClientOSError,
                ConnectionResetError, asyncio.TimeoutError):
            # All transport-level failures are reported as timeouts.
            raise RequestTimedout()
        except Exception:
            e, v = sys.exc_info()[:2]

            cmdlog.info(
                m("unexpected kv/get error",
                  type="error",
                  error_type=str(e),
                  error_value=str(v),
                  stacktrace=traceback.format_exc()).with_time())

            raise RequestTimedout()
        data = json.loads(data)
        record = None
        if data["status"] == "ok":
            if data["hasData"]:
                record = Record(data["writeID"], data["value"])
        elif data["status"] == "unknown":
            raise RequestTimedout()
        elif data["status"] == "fail":
            raise RequestCanceled()
        else:
            raise Exception(f"Unknown status: {data['status']}")
        return Response(record, data["metrics"])
示例#24
0
 async def is_ok(self):
     """Probe each endpoint with a test write; True as soon as any succeeds."""
     is_ok = False
     for endpoint in self.config["endpoints"]:
         address = f"{endpoint['host']}:{endpoint['httpport']}"
         kv = KVNode(address, address)
         try:
             await kv.put_aio("test", "value1", "wid1")
             is_ok = True
         except RequestTimedout:
             chaos_event_log.info(
                 m(f"put request to {address} timed out").with_time())
         except RequestCanceled:
             # A canceled write still proves nothing about health; try next.
             pass
         await kv.close_aio()
         if is_ok:
             return True
     return is_ok
示例#25
0
    def __init__(self, service, redpanda_mu, node):
        """Wrap a cluster node; resolve and cache the IP for its hostname.

        Resolution failures are logged with stacktrace and re-raised.
        """
        self.service = service
        self.node = node
        self.node_id = node.account.hostname
        self.redpanda_mu = redpanda_mu

        try:
            # Resolve the hostname to an IP address on the node itself.
            ip = self.node.account.ssh_output("getent hosts " + self.node_id +
                                              " | awk '{ printf $1 }'",
                                              allow_fail=False)
            self.ip = ip.decode("utf-8")
        except Exception:
            # Narrowed from a bare `except:`; log and re-raise.
            e, v = sys.exc_info()[:2]
            stacktrace = traceback.format_exc()
            errors_log.info(
                m(f"Failed to resolve {self.node_id} to ip",
                  error_type=str(e),
                  error_value=str(v),
                  stacktrace=stacktrace))
            raise
示例#26
0
 def observe(self, write_id):
     """Linearize the pending write `write_id` against the observed head.

     Walks the prev_write_id chain backwards from `write_id` until it
     reaches either the current head (every write on the chain then becomes
     the new head, oldest first) or a write that is neither the head nor
     pending (a violation). Raises Violation when versions do not strictly
     increase along the chain or the chain does not connect to the head.
     """
     write = self.pending_writes[write_id]
     chain = []
     while True:
         if write.prev_write_id == self.head.write_id:
             # Chain reached the observed head: it can be applied.
             chain.append(write)
             # Versions must strictly advance past the head's version.
             if self.head.version >= write.version:
                 raise Violation(
                     " -> ".join(map(idstr, chain)) + " -> " +
                     write.prev_write_id +
                     " doesn't lead to the latest observed state: " +
                     idstr(self.head))
             # Apply the chain oldest-first (it was collected newest-first).
             for w in reversed(chain):
                 self.applied[w.write_id] = True
                 del self.pending_writes[w.write_id]
                 self.head = AcceptedWrite(self.head.idx + 1, w.write_id,
                                           w.version, w.value)
                 self.history_by_idx[self.head.idx] = self.head
                 self.history_by_write_id[self.head.write_id] = self.head
                 cmdlog.info(
                     m(type="linearization_point",
                       write_id=w.write_id,
                       value=w.value).with_time())
             break
         elif write.prev_write_id in self.pending_writes:
             # Predecessor is still pending: extend the chain and keep walking.
             chain.append(write)
             if self.pending_writes[
                     write.prev_write_id].version >= write.version:
                 raise Violation(
                     " -> ".join(map(idstr, chain)) + " -> " +
                     write.prev_write_id +
                     " doesn't lead to the pending state: " +
                     idstr(self.pending_writes[write.prev_write_id]))
             write = self.pending_writes[write.prev_write_id]
         else:
             # Predecessor is unknown: the chain cannot connect to the head.
             chain.append(write)
             raise Violation(
                 " -> ".join(map(idstr, chain)) + " -> " +
                 write.prev_write_id +
                 " doesn't lead to the latest observed state: " +
                 idstr(self.head))
示例#27
0
文件: faults.py 项目: krambn/redpanda
    def inject(self, cluster, workload):
        """Kill the service on a selected node and verify it actually died.

        Raises Exception when no node can be selected, when the kill or a
        status check errors out, or when the service still runs after the
        final check.
        """
        self.workload = workload
        self.node = self.node_selector(cluster)
        # identity check (`is None`) instead of `== None`
        if self.node is None:
            chaos_event_log.info(m("can't select a node").with_time())
            raise Exception("can't select a node")

        try:
            chaos_event_log.info(
                m(f"terminating a service on {self.node.node_id} ({self.scope})"
                  ).with_time())
            self.workload.availability_logger.log_fault(
                f"terminating a service on {self.node.node_id} ({self.scope})")
            self.node.kill()
            chaos_event_log.info(
                m(f"a service on {self.node.node_id} terminated").with_time())
        except Exception:
            # Narrowed from a bare `except:`; log and re-raise.
            e, v = sys.exc_info()[:2]
            stacktrace = traceback.format_exc()
            chaos_event_log.info(
                m("error on terminating a service",
                  error_type=str(e),
                  error_value=str(v),
                  stacktrace=stacktrace).with_time())
            raise

        is_running = True

        # kill may be asynchronous: poll three times, 1s apart, keeping the
        # last observed status.
        for _ in range(0, 3):
            time.sleep(1)
            try:
                is_running = self.node.is_service_running()
            except Exception:
                # Narrowed from a bare `except:`; log and re-raise.
                e, v = sys.exc_info()[:2]
                stacktrace = traceback.format_exc()
                chaos_event_log.info(
                    m("error on checking status of a service",
                      error_type=str(e),
                      error_value=str(v),
                      stacktrace=stacktrace).with_time())
                raise

        if is_running:
            chaos_event_log.info(
                m(f"can't terminate a service on {self.node.node_id}").
                with_time())
            raise Exception(
                f"can't terminate a service on {self.node.node_id}")
示例#28
0
    async def restart(self):
        """Tear the cluster down and bring kafka + zookeeper back up.

        Recreates data dirs and configs, starts zookeeper before kafka on
        every node, waits for the configured warmup, recreates the topic and
        restarts the helper APIs.
        """
        chaos_stdout.info("(re)starting a cluster")
        self.teardown()

        self._mount()

        for node_id, node in self.nodes.items():
            chaos_event_log.info(m(f"preparing dirs {node_id}").with_time())
            node.prep_dirs()

        for node in self.nodes.values():
            node.write_zookeeper_configs()
            node.write_kafka_config()

        # Zookeeper must be up on all nodes before kafka starts.
        for node in self.nodes.values():
            node.start_zookeeper()

        for node in self.nodes.values():
            node.start_kafka()

        cluster_warmup = self.config["cluster_warmup"]
        await asyncio.sleep(cluster_warmup)
        chaos_stdout.info("cluster started")
        chaos_stdout.info("creating topic")
        node = self.any_node()
        node.create_topic()
        chaos_stdout.info("topic created")
        self._start_api()
        self._strobe_api_start()
        self._strobe_recover()
        # TODO: Replace sleep with an explicit check waiting for kafkakv & strobe
        # services to start
        # Was a blocking time.sleep(2), which stalled the event loop inside
        # this coroutine; use the non-blocking asyncio.sleep instead.
        await asyncio.sleep(2)

        chaos_stdout.info("")
示例#29
0
async def run(config, n, overrides):
    """Run `n` chaos-test iterations with an already-loaded `config` dict.

    `overrides` is an iterable of "key=value" strings where value is JSON;
    each is applied on top of the config.
    """
    init_output(config)

    if overrides:
        for override in overrides:  # fixed "overide" typo
            [key, value] = override.split("=", 1)
            config[key] = json.loads(value)

    faults = {fault: known_faults[fault] for fault in config["faults"]}

    with RedpandaCluster(config) as cluster:
        try:
            for _ in range(0, n):
                if not config["reset_before_test"]:
                    await cluster.restart()
                    if not await cluster.is_ok():
                        chaos_event_log.info(
                            m("cluster isn't healthy").with_time())
                        raise Exception("cluster isn't healthy")
                await inject_recover_scenarios_aio(
                    config, cluster, faults, lambda: workload_factory(config))
        except ViolationInducedExit:
            pass
示例#30
0
 def _mount(self):
     """Mount the data directory on every node."""
     for node_id, node in self.nodes.items():
         chaos_event_log.info(m(f"mount data dir on {node_id}").with_time())
         node.mount()