Example #1
    def test_memory_stress_replicas_group_load_balance(self, prepare_collection):
        """
        target: test apply memory stress on replicas and load balance inside group
        method: 1.Deploy milvus and limit querynode memory 6Gi
                2.Insert 1,000,000 entities (500MB), load 2 replicas (memory usage 1.5GB)
                3.Apply memory stress 4Gi on querynode
        expected: Verify that load balancing occurs
        """
        collection_w = prepare_collection
        utility_w = ApiUtilityWrapper()
        release_name = "mic-memory"

        # load and search
        collection_w.load(replica_number=2)
        progress, _ = utility_w.loading_progress(collection_w.name)
        assert progress["loading_progress"] == "100%"

        # get the replica and random chaos querynode
        replicas, _ = collection_w.get_replicas()
        chaos_querynode_id = replicas.groups[0].group_nodes[0]
        label = f"app.kubernetes.io/instance={release_name}, app.kubernetes.io/component=querynode"
        querynode_id_pod_pair = get_querynode_id_pod_pairs("chaos-testing", label)
        chaos_querynode_pod = querynode_id_pod_pair[chaos_querynode_id]

        # get the segment num before chaos
        seg_info_before, _ = utility_w.get_query_segment_info(collection_w.name)
        seg_distribution_before = cf.get_segment_distribution(seg_info_before)
        segments_num_before = len(seg_distribution_before[chaos_querynode_id]["sealed"])
        log.debug(segments_num_before)
        log.debug(seg_distribution_before[chaos_querynode_id]["sealed"])

        # apply memory stress
        chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_replicas_memory_stress_pods.yaml")
        chaos_config['spec']['selector']['pods']['chaos-testing'] = [chaos_querynode_pod]
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug(f"Apply memory stress on querynode {chaos_querynode_id}, pod {chaos_querynode_pod}")

        duration = chaos_config.get('spec').get('duration')
        duration = duration.replace('h', '*3600+').replace('m', '*60+').replace('s', '*1+') + '+0'
        sleep(eval(duration))

        chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

        # Verify automatic load balance happened
        seg_info_after, _ = utility_w.get_query_segment_info(collection_w.name)
        seg_distribution_after = cf.get_segment_distribution(seg_info_after)
        segments_num_after = len(seg_distribution_after[chaos_querynode_id]["sealed"])
        log.debug(segments_num_after)
        log.debug(seg_distribution_after[chaos_querynode_id]["sealed"])

        assert segments_num_after < segments_num_before
        search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                            ct.default_float_vec_field_name, ct.default_search_params,
                                            ct.default_limit, timeout=120)
        assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
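# The tests in this file convert a Chaos Mesh duration string such as "2m30s" into an
# arithmetic expression ("2*60+30*1++0") and eval() it to get seconds. A minimal
# eval-free sketch of the same conversion, assuming durations only combine h/m/s units:
import re

def duration_to_seconds(duration: str) -> int:
    # "1h2m30s" -> 3750; any text outside <number><h|m|s> pairs is ignored
    units = {"h": 3600, "m": 60, "s": 1}
    return sum(int(value) * units[unit]
               for value, unit in re.findall(r"(\d+)([hms])", duration))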
Example #2
    def parser_testcase_config(self, chaos_yaml, chaos_config):
        cluster_nodes = check_cluster_nodes(chaos_config)
        tests_yaml = constants.TESTS_CONFIG_LOCATION + 'testcases.yaml'
        tests_config = cc.gen_experiment_config(tests_yaml)
        test_collections = tests_config.get('Collections', None)
        for t in test_collections:
            test_chaos = t.get('testcase', {}).get('chaos', {})
            if test_chaos in chaos_yaml:
                expects = t.get('testcase',
                                {}).get('expectation',
                                        {}).get('cluster_1_node', {})
                # for the cluster_n_node
                if cluster_nodes > 1:
                    expects = t.get('testcase',
                                    {}).get('expectation',
                                            {}).get('cluster_n_node', {})
                log.info(f"yaml.expects: {expects}")
                self.expect_create = expects.get(Op.create.value,
                                                 constants.SUCC)
                self.expect_insert = expects.get(Op.insert.value,
                                                 constants.SUCC)
                self.expect_flush = expects.get(Op.flush.value, constants.SUCC)
                self.expect_index = expects.get(Op.index.value, constants.SUCC)
                self.expect_search = expects.get(Op.search.value,
                                                 constants.SUCC)
                self.expect_query = expects.get(Op.query.value, constants.SUCC)
                log.info(
                    f"self.expects: create:{self.expect_create}, insert:{self.expect_insert}, "
                    f"flush:{self.expect_flush}, index:{self.expect_index}, "
                    f"search:{self.expect_search}, query:{self.expect_query}")
                return True

        return False
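# parser_testcase_config above reads testcases.yaml into a nested dict. A hedged sketch of
# the structure implied by the lookups in the code (the file name, op keys and succ/fail
# values below are illustrative assumptions, not copied from the real config):
tests_config_sketch = {
    "Collections": [
        {
            "testcase": {
                "chaos": "chaos_querynode_pod_kill.yaml",   # substring-matched against chaos_yaml
                "expectation": {
                    "cluster_1_node": {"search": "fail", "query": "fail"},
                    "cluster_n_node": {"search": "succ", "query": "succ"},
                },
            }
        }
    ]
}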
Example #3
    def test_chaos_memory_stress_indexnode(self, connection, chaos_yaml):
        """
        target: test inject memory stress into indexnode
        method: 1.Deploy milvus and limit indexnode memory resource 3 / 4Gi
                2.Create collection and insert some data
                3.Inject memory stress chaos 512Mi
                4.Create index
        expected:
        """
        # init collection and insert
        nb = 256000  # vector data size: 512*4*nb is about 512Mi; creating the index needs about 2.8Gi memory
        dim = 512
        # c_name = cf.gen_unique_str('chaos_memory')
        c_name = 'chaos_memory_gKs8aSUu'
        index_params = {"index_type": "IVF_SQ8", "metric_type": "L2", "params": {"nlist": 128}}

        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(name=c_name,
                                     schema=cf.gen_default_collection_schema(dim=dim), shards_num=1)

        # insert 256,000 entities of dim 512, about 512Mi in total
        for i in range(2):
            t0_insert = datetime.datetime.now()
            df = cf.gen_default_dataframe_data(nb=nb // 2, dim=dim)
            res = collection_w.insert(df)[0]
            assert res.insert_count == nb // 2
            # log.info(f'After {i + 1} insert, num_entities: {collection_w.num_entities}')
            tt_insert = datetime.datetime.now() - t0_insert
            log.info(f"{i} insert data cost: {tt_insert}")

        # flush
        t0_flush = datetime.datetime.now()
        assert collection_w.num_entities == nb
        tt_flush = datetime.datetime.now() - t0_flush
        log.info(f'flush {nb} entities cost: {tt_flush}')

        log.info(collection_w.indexes[0].params)
        if collection_w.has_index()[0]:
            collection_w.drop_index()

        # indexNode start build index, inject chaos memory stress
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("inject chaos")

        # create index
        t0_index = datetime.datetime.now()
        index, _ = collection_w.create_index(field_name=ct.default_float_vec_field_name,
                                             index_params=index_params)
        tt_index = datetime.datetime.now() - t0_index

        log.info(f"create index cost: {tt_index}")
        log.info(collection_w.indexes[0].params)
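# The test above injects memory stress while the index is building but never checks build
# progress. A hedged sketch of polling progress via the pymilvus utility module (the
# polling loop and helper name are ours, not part of the test framework):
def wait_index_built(c_name, timeout=600, interval=5):
    start = time.time()
    while time.time() - start < timeout:
        progress = utility.index_building_progress(c_name)
        log.info(f"index building progress: {progress}")
        if progress.get("indexed_rows", 0) >= progress.get("total_rows", 1):
            return True
        sleep(interval)
    return False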
def apply_memory_stress(chaos_yaml):
    chaos_config = gen_experiment_config(chaos_yaml)
    log.debug(chaos_config)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.debug("chaos injected")
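# A hedged sketch of the same inject-then-clean-up flow as a context manager, built on the
# gen_experiment_config/CusResource helpers already used above, so callers cannot forget to
# delete the chaos object:
from contextlib import contextmanager

@contextmanager
def memory_stress(chaos_yaml):
    chaos_config = gen_experiment_config(chaos_yaml)
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.debug("chaos injected")
    try:
        yield chaos_res
    finally:
        chaos_res.delete(chaos_config['metadata']['name'])
        log.debug("chaos deleted")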
Example #5
    def test_chaos_memory_stress_querynode(self, connection, chaos_yaml):
        """
        target: explore query node behavior after memory stress chaos injected and recovered
        method: 1. Create a collection, insert some data
                2. Inject memory stress chaos
                3. Start a thread to load, search and query
                4. After the chaos duration, check query and search success rate
                5. Finally delete the chaos object, or let the chaos experiment finish
        expected: 1.If memory is insufficient, querynode is OOMKilled and available after restart
                  2.If memory is sufficient, the succ rates of query and search are both 1.0
        """
        c_name = 'chaos_memory_nx6DNW4q'
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(c_name)
        log.debug(collection_w.schema)
        log.debug(collection_w._shards_num)

        # apply memory stress chaos
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("chaos injected")
        duration = chaos_config.get('spec').get('duration')
        duration = duration.replace('h', '*3600+').replace(
            'm', '*60+').replace('s', '*1+') + '+0'
        meta_name = chaos_config.get('metadata').get('name')
        # wait memory stress
        sleep(constants.WAIT_PER_OP * 2)

        # do release, load, query and search in a loop for the chaos duration
        try:
            start = time.time()
            while time.time() - start < eval(duration):
                collection_w.release()
                collection_w.load()

                term_expr = f'{ct.default_int64_field_name} in {[random.randint(0, 100)]}'
                query_res, _ = collection_w.query(term_expr)
                assert len(query_res) == 1

                search_res, _ = collection_w.search(
                    cf.gen_vectors(1, ct.default_dim),
                    ct.default_float_vec_field_name, ct.default_search_params,
                    ct.default_limit)
                log.debug(search_res[0].ids)
                assert len(search_res[0].ids) == ct.default_limit

        except Exception as e:
            raise Exception(str(e))

        finally:
            chaos_res.delete(meta_name)
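# The docstring above says to "check query search success rate", but the loop aborts on the
# first failed assert. A hedged sketch of counting successes instead (the helper name and
# the 'in [0]' expression are ours; collection_w is the same wrapper used above):
def query_search_succ_rate(collection_w, duration_s):
    succ = total = 0
    start = time.time()
    while time.time() - start < duration_s:
        total += 1
        try:
            collection_w.query(f'{ct.default_int64_field_name} in [0]')
            collection_w.search(cf.gen_vectors(1, ct.default_dim),
                                ct.default_float_vec_field_name,
                                ct.default_search_params, ct.default_limit)
            succ += 1
        except Exception as e:
            log.warning(f"op failed during chaos: {e}")
    return succ / max(total, 1)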
Example #6
    def test_memory_stress_replicas_group_insufficient(self, prepare_collection, mode):
        """
        target: test apply memory stress on different numbers of querynodes so that the group fails to load
                because the memory is insufficient
        method: 1.Limit querynodes memory 5Gi
                2.Create collection and insert 1,000,000 entities
                3.Apply memory stress on querynodes so their memory is not enough to load replicas
        expected: Verify load raise exception, and after delete chaos, load and search successfully
        """
        collection_w = prepare_collection
        utility_w = ApiUtilityWrapper()
        chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_querynode_memory_stress.yaml")

        # Update config
        chaos_config['spec']['mode'] = mode
        chaos_config['spec']['stressors']['memory']['size'] = '5Gi'
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        # chaos_start = time.time()
        log.debug("chaos injected")
        sleep(10)

        try:
            # load failed
            err = {"err_code": 1, "err_msg": "shuffleSegmentsToQueryNodeV2: insufficient memory of available node"}
            collection_w.load(replica_number=5, timeout=60, check_task=CheckTasks.err_res, check_items=err)

            # query failed because not loaded
            err = {"err_code": 1, "err_msg": "not loaded into memory"}
            collection_w.query("int64 in [0]", check_task=CheckTasks.err_res, check_items=err)

            # delete chaos
            meta_name = chaos_config.get('metadata', None).get('name', None)
            chaos_res.delete(metadata_name=meta_name)
            sleep(10)

            # after delete chaos load and query successfully
            collection_w.load(replica_number=5, timeout=60)
            progress, _ = utility_w.loading_progress(collection_w.name)
            # assert progress["loading_progress"] == "100%"
            query_res, _ = collection_w.query("int64 in [0]")
            assert len(query_res) != 0

            collection_w.release()

        except Exception as e:
            raise Exception(str(e))

        finally:
            log.debug("Test finished")
Example #7
    def test_memory_stress_replicas_cross_group_load_balance(self, prepare_collection):
        """
        target: test apply memory stress on one group and verify no load balance across replica groups
        method: 1.Limit all querynodes memory 6Gi
                2.Create and insert 1,000,000 entities
                3.Load collection with two replicas
                4.Apply 80% memory stress on one group
        expected: Verify that load balancing across groups does not occur
        """
        collection_w = prepare_collection
        utility_w = ApiUtilityWrapper()
        release_name = "mic-memory"

        # load and search
        collection_w.load(replica_number=2)
        progress, _ = utility_w.loading_progress(collection_w.name)
        assert progress["loading_progress"] == "100%"
        seg_info_before, _ = utility_w.get_query_segment_info(collection_w.name)

        # get the replica and random chaos querynode
        replicas, _ = collection_w.get_replicas()
        group_nodes = list(replicas.groups[0].group_nodes)
        label = f"app.kubernetes.io/instance={release_name}, app.kubernetes.io/component=querynode"
        querynode_id_pod_pair = get_querynode_id_pod_pairs("chaos-testing", label)
        group_nodes_pod = [querynode_id_pod_pair[node_id] for node_id in group_nodes]

        # apply memory stress
        chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_replicas_memory_stress_pods.yaml")
        chaos_config['spec']['selector']['pods']['chaos-testing'] = group_nodes_pod
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug(f"Apply memory stress on querynode {group_nodes}, pod {group_nodes_pod}")

        duration = chaos_config.get('spec').get('duration')
        duration = duration.replace('h', '*3600+').replace('m', '*60+').replace('s', '*1+') + '+0'
        sleep(eval(duration))

        chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

        # Verify no cross-group load balance occurred
        seg_info_after, _ = utility_w.get_query_segment_info(collection_w.name)
        seg_distribution_before = cf.get_segment_distribution(seg_info_before)
        seg_distribution_after = cf.get_segment_distribution(seg_info_after)
        for node_id in group_nodes:
            assert len(seg_distribution_before[node_id]) == len(seg_distribution_after[node_id])

        search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                            ct.default_float_vec_field_name, ct.default_search_params,
                                            ct.default_limit, timeout=120)
        assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
    def test_chaos_memory_stress_etcd(self, chaos_yaml):
        """
        target: test inject memory stress into all etcd pods
        method: 1.Deploy milvus and limit etcd memory resource to 1Gi with `all` mode
                2.Continuously and concurrently do milvus operations
                3.Inject memory stress chaos 51024Mi
                4.After duration, delete chaos stress
        expected: Verify milvus operation succ rate
        """
        mic_checkers = {
            Op.create: CreateChecker(),
            Op.insert: InsertFlushChecker(),
            Op.flush: InsertFlushChecker(flush=True),
            Op.index: IndexChecker(),
            Op.search: SearchChecker(),
            Op.query: QueryChecker()
        }
        # start thread keep running milvus op
        start_monitor_threads(mic_checkers)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        # duration = chaos_config["spec"]["duration"]
        meta_name = chaos_config.get('metadata').get('name')
        duration = chaos_config.get('spec').get('duration')

        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("Chaos injected")

        # convert string duration time to a int number in seconds
        if isinstance(duration, str):
            duration = duration.replace('h', '*3600+').replace(
                'm', '*60+').replace('s', '*1+') + '+0'
        else:
            log.error("Duration must be string type")

        # Delete experiment after it's over
        timer = threading.Timer(interval=eval(duration),
                                function=chaos_res.delete,
                                args=(meta_name, False))
        timer.start()
        timer.join()

        # output milvus op succ rate
        for k, ch in mic_checkers.items():
            log.debug(f'Succ rate of {k.value}: {ch.succ_rate()}')
            assert ch.succ_rate() == 1.0
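# Each checker above exposes succ_rate(); a minimal sketch of that contract, assuming a
# checker only needs to count passed/failed operations (the real *Checker classes in the
# chaos framework also run the operations in background threads):
class MiniChecker:
    def __init__(self):
        self._succ = 0
        self._total = 0

    def record(self, ok: bool):
        self._total += 1
        self._succ += int(ok)

    def succ_rate(self) -> float:
        return self._succ / self._total if self._total else 0.0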
Example #9
    def parser_testcase_config(self, chaos_yaml, chaos_config):
        # TODO: need a better way (maybe recursion) to parse chaos_config
        # selector key is located in different depth when chaos config's kind is different
        # for now, there are two kinds of chaos config: xxChaos and Schedule(applied in pod kill chaos).
        if chaos_config["kind"] == "Schedule":
            for k, v in chaos_config["spec"].items():
                if "Chaos" in k and "selector" in v.keys():
                    selector = v["selector"]
                    break
        else:
            selector = chaos_config["spec"]["selector"]
        log.info(f"chaos target selector: {selector}")
        tests_yaml = constants.TESTS_CONFIG_LOCATION + 'testcases.yaml'
        tests_config = cc.gen_experiment_config(tests_yaml)
        test_collections = tests_config.get('Collections', None)
        for t in test_collections:
            test_chaos = t.get('testcase', {}).get('chaos', {})
            if test_chaos in chaos_yaml:
                expects = t.get('testcase',
                                {}).get('expectation',
                                        {}).get('cluster_1_node', {})
                # get the number of pods
                namespace = selector["namespaces"][0]
                labels_dict = selector["labelSelectors"]
                labels_list = []
                for k, v in labels_dict.items():
                    labels_list.append(k + "=" + v)
                labels_str = ",".join(labels_list)
                pods = get_pod_list(namespace, labels_str)
                # for the cluster_n_node
                if len(pods) > 1:
                    expects = t.get('testcase',
                                    {}).get('expectation',
                                            {}).get('cluster_n_node', {})
                log.info(f"yaml.expects: {expects}")
                self.expect_create = expects.get(Op.create.value,
                                                 constants.SUCC)
                self.expect_insert = expects.get(Op.insert.value,
                                                 constants.SUCC)
                self.expect_flush = expects.get(Op.flush.value, constants.SUCC)
                self.expect_index = expects.get(Op.index.value, constants.SUCC)
                self.expect_search = expects.get(Op.search.value,
                                                 constants.SUCC)
                self.expect_query = expects.get(Op.query.value, constants.SUCC)
                log.info(
                    f"self.expects: create:{self.expect_create}, insert:{self.expect_insert}, "
                    f"flush:{self.expect_flush}, index:{self.expect_index}, "
                    f"search:{self.expect_search}, query:{self.expect_query}")
                return True

        return False
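# The TODO above suggests recursion for locating the selector regardless of the chaos kind.
# A hedged sketch of such a lookup, returning the first "selector" mapping found anywhere
# in the chaos config:
def find_selector(node):
    if isinstance(node, dict):
        if "selector" in node:
            return node["selector"]
        for value in node.values():
            found = find_selector(value)
            if found is not None:
                return found
    elif isinstance(node, list):
        for item in node:
            found = find_selector(item)
            if found is not None:
                return found
    return None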
def reboot_pod(chaos_yaml):
    # parse chaos object
    chaos_config = gen_experiment_config(chaos_yaml)
    log.debug(chaos_config)
    # inject chaos
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.debug("chaos injected")
    sleep(7)
    # delete chaos
    meta_name = chaos_config.get('metadata', None).get('name', None)
    chaos_res.delete(meta_name)
    log.debug("chaos deleted")
Example #11
    def test_memory_stress_replicas_load_balance_single_node(self, prepare_collection):
        """
        target: test apply memory stress on a single-node replica so that it is OOMKilled
        method: 1.Deploy 2 querynodes and limit memory 6Gi
                2.Load 1,000,000 entities (data size 500MB) with 2 replicas (memory usage 1.5GB)
                3.Apply memory stress on one querynode and make it OOMKilled
        expected: After deleting chaos, the querynode returns to running and search succeeds
        """
        collection_w = prepare_collection
        utility_w = ApiUtilityWrapper()

        # load and search
        collection_w.load(replica_number=2)
        progress, _ = utility_w.loading_progress(collection_w.name)
        assert progress["loading_progress"] == "100%"
        query_res, _ = collection_w.query("int64 in [0]")
        assert len(query_res) != 0

        # apply memory stress
        chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_querynode_memory_stress.yaml")

        # Update config
        chaos_config['spec']['mode'] = "one"
        chaos_config['spec']['stressors']['memory']['size'] = '6Gi'
        chaos_config['spec']['duration'] = "1m"
        log.debug(chaos_config)
        duration = chaos_config.get('spec').get('duration')
        duration = duration.replace('h', '*3600+').replace('m', '*60+').replace('s', '*1+') + '+0'
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)

        sleep(eval(duration))
        chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

        # release and load again
        collection_w.release()
        collection_w.load(replica_number=2)
        progress, _ = utility_w.loading_progress(collection_w.name)
        assert progress["loading_progress"] == "100%"
        search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                            ct.default_float_vec_field_name, ct.default_search_params,
                                            ct.default_limit, timeout=120)
        assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
    def test_chaos_memory_stress_datanode(self, chaos_yaml):
        """
        target: test inject memory stress into dataNode
        method: 1.Deploy milvus and limit datanode memory resource
                2.Create collection and insert some data
                3.Inject memory stress chaos
                4.Continue to insert data
        expected:
        """
        # init collection and insert 10 batches of 25,000 entities (250,000 total)
        nb = 25000
        dim = 512
        c_name = cf.gen_unique_str('chaos_memory')
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(
            name=c_name, schema=cf.gen_default_collection_schema(dim=dim))
        for i in range(10):
            t0 = datetime.datetime.now()
            df = cf.gen_default_dataframe_data(nb=nb, dim=dim)
            res = collection_w.insert(df)[0]
            assert res.insert_count == nb
            log.info(
                f'After {i + 1} insert, num_entities: {collection_w.num_entities}'
            )
            tt = datetime.datetime.now() - t0
            log.info(f"{i} insert and flush data cost: {tt}")

        # inject memory stress
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("chaos injected")

        # Continue to insert data
        collection_w.insert(df)
        log.info(f'Total num entities: {collection_w.num_entities}')

        # delete chaos
        meta_name = chaos_config.get('metadata', None).get('name', None)
        chaos_res.delete(metadata_name=meta_name)
Example #13
    def test_chaos_memory_stress_replicas_OOM(self, prepare_collection, mode):
        """
        target: test apply memory stress during loading, and querynode OOMKilled
        method: 1.Deploy and limit querynode memory to 6Gi
                2.Create collection and insert 1,000,000 entities
                3.Apply memory stress so the querynode is OOMKilled while loading replicas
        expected: Verify that Milvus is still available to load and search after the querynode restarts
        """
        collection_w = prepare_collection
        utility_w = ApiUtilityWrapper()

        chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_querynode_memory_stress.yaml")
        chaos_config['spec']['mode'] = mode
        chaos_config['spec']['duration'] = '3m'
        chaos_config['spec']['stressors']['memory']['size'] = '6Gi'
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)

        chaos_res.create(chaos_config)
        log.debug("chaos injected")
        collection_w.load(replica_number=2, timeout=60, _async=True)

        utility_w.wait_for_loading_complete(collection_w.name)
        progress, _ = utility_w.loading_progress(collection_w.name)
        assert progress["loading_progress"] == '100%'

        sleep(180)
        chaos_res.delete(metadata_name=chaos_config.get('metadata', None).get('name', None))

        # TODO search failed
        search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                            ct.default_float_vec_field_name, ct.default_search_params,
                                            ct.default_limit, timeout=120)
        assert 1 == len(search_res) and ct.default_limit == len(search_res[0])

        collection_w.release()
        collection_w.load(replica_number=2)
        search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                            ct.default_float_vec_field_name, ct.default_search_params,
                                            ct.default_limit, timeout=120)
        assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
Example #14
    def test_memory_stress_replicas_group_sufficient(self, prepare_collection, mode):
        """
        target: test apply memory stress on one querynode while memory is still sufficient to load replicas
        method: 1.Limit all querynodes memory 6Gi
                2.Apply 3Gi memory stress on different numbers of querynodes (loading the whole collection needs about 1.5GB)
        expected: Verify load successfully and search result are correct
        """
        collection_w = prepare_collection
        utility_w = ApiUtilityWrapper()

        # apply memory stress chaos
        chaos_config = gen_experiment_config("./chaos_objects/memory_stress/chaos_querynode_memory_stress.yaml")
        chaos_config['spec']['mode'] = mode
        chaos_config['spec']['duration'] = '3m'
        chaos_config['spec']['stressors']['memory']['size'] = '3Gi'
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("chaos injected")
        sleep(20)
        try:
            collection_w.load(replica_number=2, timeout=60)
            utility_w.loading_progress(collection_w.name)
            replicas, _ = collection_w.get_replicas()
            log.debug(replicas)
            search_res, _ = collection_w.search(cf.gen_vectors(1, dim=self.dim),
                                                ct.default_float_vec_field_name, ct.default_search_params,
                                                ct.default_limit, timeout=120)
            assert 1 == len(search_res) and ct.default_limit == len(search_res[0])
            collection_w.release()

        except Exception as e:
            raise Exception(str(e))

        finally:
            # delete chaos
            meta_name = chaos_config.get('metadata', None).get('name', None)
            chaos_res.delete(metadata_name=meta_name)
            log.debug("Test finished")
Example #15
    def parser_testcase_config(self, chaos_yaml):
        tests_yaml = constants.TESTS_CONFIG_LOCATION + 'testcases.yaml'
        tests_config = cc.gen_experiment_config(tests_yaml)
        test_collections = tests_config.get('Collections', None)
        ms = MilvusSys(alias="default")
        node_map = {
            "querynode": "query_nodes",
            "datanode": "data_nodes",
            "indexnode": "index_nodes",
            "proxy": "proxy_nodes"
        }
        for t in test_collections:
            test_chaos = t.get('testcase', {}).get('chaos', {})
            if test_chaos in chaos_yaml:
                expects = t.get('testcase',
                                {}).get('expectation',
                                        {}).get('cluster_1_node', {})
                # for cluster_n_node mode
                for node in node_map.keys():
                    if node in test_chaos and len(getattr(ms,
                                                          node_map[node])) > 1:
                        expects = t.get('testcase',
                                        {}).get('expectation',
                                                {}).get('cluster_n_node', {})
                log.info(f"yaml.expects: {expects}")
                self.expect_create = expects.get(Op.create.value,
                                                 constants.SUCC)
                self.expect_insert = expects.get(Op.insert.value,
                                                 constants.SUCC)
                self.expect_flush = expects.get(Op.flush.value, constants.SUCC)
                self.expect_index = expects.get(Op.index.value, constants.SUCC)
                self.expect_search = expects.get(Op.search.value,
                                                 constants.SUCC)
                self.expect_query = expects.get(Op.query.value, constants.SUCC)
                log.info(
                    f"self.expects: create:{self.expect_create}, insert:{self.expect_insert}, "
                    f"flush:{self.expect_flush}, index:{self.expect_index}, "
                    f"search:{self.expect_search}, query:{self.expect_query}")
                return True

        return False
    def test_multi_replicas_with_only_one_group_available(
            self, chaos_type, failed_node_type, failed_group_scope,
            is_streaming):
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        # log.info(f"chaos_yaml: {chaos_yaml}")
        log.info(connections.get_connection_addr('default'))
        if is_streaming is False:
            del self.health_checkers[Op.insert]
        cc.start_monitor_threads(self.health_checkers)
        # get replicas info
        release_name = "milvus-multi-querynode"
        querynode_id_pod_pair = get_querynode_info(release_name)
        log.info(querynode_id_pod_pair)
        group_list = []
        shard_leader_list = []
        replicas_info, _ = self.health_checkers[
            Op.search].c_wrap.get_replicas()
        for g in replicas_info.groups:
            group_list.append(list(g.group_nodes))
            for shard in g.shards:
                shard_leader_list.append(shard.shard_leader)
        # keep only one group healthy; the other groups are made unhealthy by injecting pod-failure chaos,
        # and in each affected group one pod is put into pod-failure status
        target_pod_list = []
        target_group = []
        group_list = sorted(group_list, key=lambda x: -len(x))
        if failed_group_scope == "one":
            target_group = random.sample(group_list, 1)
        if failed_group_scope == "except_one":
            target_group = random.sample(group_list, len(group_list) - 1)
        if failed_group_scope == "all":
            target_group = group_list[:]
        for g in target_group:
            target_nodes = []
            if failed_node_type == "shard_leader":
                target_nodes = list(set(g) & set(shard_leader_list))
            if failed_node_type == "non_shard_leader":
                target_nodes = list(set(g) - set(shard_leader_list))
            for target_node in target_nodes:
                pod = querynode_id_pod_pair[target_node]
                target_pod_list.append(pod)
        log.info(f"target_pod_list: {target_pod_list}")
        chaos_config = cc.gen_experiment_config(
            f"chaos/chaos_objects/template/{chaos_type}-by-pod-list.yaml")
        chaos_config['metadata'][
            'name'] = f"test-multi-replicase-{int(time.time())}"
        meta_name = chaos_config.get('metadata', None).get('name', None)
        chaos_config['spec']['selector']['pods'][
            'chaos-testing'] = target_pod_list
        self._chaos_config = chaos_config  # cache the chaos config for tear down

        log.info(f"chaos_config: {chaos_config}")
        # wait 20s
        sleep(constants.WAIT_PER_OP * 2)
        # replicas info
        replicas_info, _ = self.health_checkers[
            Op.search].c_wrap.get_replicas()
        log.info(
            f"replicas_info for search collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}"
        )

        # assert statistic:all ops 100% succ
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)
        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)

        chaos_res.create(chaos_config)
        log.info("chaos injected")
        sleep(constants.WAIT_PER_OP * 2)
        # reset counting
        cc.reset_counting(self.health_checkers)

        # wait 120s
        sleep(constants.CHAOS_DURATION)
        log.info(f'Alive threads: {threading.enumerate()}')
        # node info
        querynode_id_pod_pair = get_querynode_info(release_name)
        log.info(querynode_id_pod_pair)
        # replicas info
        replicas_info, _ = self.health_checkers[
            Op.search].c_wrap.get_replicas()
        log.info(
            f"replicas_info for search collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}"
        )

        replicas_info, _ = self.health_checkers[Op.query].c_wrap.get_replicas()
        log.info(
            f"replicas_info for query collection {self.health_checkers[Op.query].c_wrap.name}: {replicas_info}"
        )
        # assert statistic
        log.info("******2nd assert after chaos injected: ")
        expectations = {Op.search: constants.SUCC, Op.query: constants.SUCC}
        if failed_group_scope == "all":
            expectations = {
                Op.search: constants.FAIL,
                Op.query: constants.FAIL
            }
        assert_statistic(self.health_checkers, expectations=expectations)
        # delete chaos
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        sleep(2)
        # wait all pods ready
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={release_name}"
        )
        ready_1 = wait_pods_ready(
            constants.CHAOS_NAMESPACE,
            f"app.kubernetes.io/instance={release_name}")
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={release_name}"
        )
        ready_2 = wait_pods_ready(constants.CHAOS_NAMESPACE,
                                  f"release={release_name}")
        if ready_1 and ready_2:
            log.info("all pods are ready")
        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        # cc.reconnect(connections, alias='default')
        # reset counting again
        cc.reset_counting(self.health_checkers)
        # wait 50s (varies by feature)
        sleep(constants.WAIT_PER_OP * 5)
        # node info
        querynode_id_pod_pair = get_querynode_info(release_name)
        log.info(querynode_id_pod_pair)
        sleep(120)
        # replicas info
        replicas_info, _ = self.health_checkers[
            Op.search].c_wrap.get_replicas()
        log.info(
            f"replicas_info for collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}"
        )
        replicas_info, _ = self.health_checkers[Op.query].c_wrap.get_replicas()
        log.info(
            f"replicas_info for collection {self.health_checkers[Op.query].c_wrap.name}: {replicas_info}"
        )
        # assert statistic: all ops success again
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        # assert all expectations
        assert_expectations()

        log.info(
            "*********************Chaos Test Completed**********************")
    def test_chaos_data_consist(self, connection, chaos_yaml):
        """
        target: verify data consistence after chaos injected and recovered
        method: 1. create a collection, insert some data, search and query
                2. inject a chaos object
                3. reconnect to service
                4. verify a) data entities persists, index persists,
                          b) search and query results persist
        expected: collection data and results persist
        """
        c_name = cf.gen_unique_str('chaos_collection_')
        nb = 5000
        i_name = cf.gen_unique_str('chaos_index_')
        index_params = {
            "index_type": "IVF_SQ8",
            "metric_type": "L2",
            "params": {
                "nlist": 64
            }
        }

        # create
        t0 = datetime.datetime.now()
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(name=c_name,
                                     schema=cf.gen_default_collection_schema())
        tt = datetime.datetime.now() - t0
        log.info(f"assert create: {tt}")
        assert collection_w.name == c_name

        # insert
        data = cf.gen_default_list_data(nb=nb)
        t0 = datetime.datetime.now()
        _, res = collection_w.insert(data)
        tt = datetime.datetime.now() - t0
        log.info(f"assert insert: {tt}")
        assert res

        # flush
        t0 = datetime.datetime.now()
        assert collection_w.num_entities == nb
        tt = datetime.datetime.now() - t0
        log.info(f"assert flush: {tt}")

        # search
        collection_w.load()
        search_vectors = cf.gen_vectors(1, ct.default_dim)
        t0 = datetime.datetime.now()
        search_params = {"metric_type": "L2", "params": {"nprobe": 16}}
        search_res, _ = collection_w.search(
            data=search_vectors,
            anns_field=ct.default_float_vec_field_name,
            param=search_params,
            limit=1)
        tt = datetime.datetime.now() - t0
        log.info(f"assert search: {tt}")
        assert len(search_res) == 1

        # index
        t0 = datetime.datetime.now()
        index, _ = collection_w.create_index(
            field_name=ct.default_float_vec_field_name,
            index_params=index_params,
            name=i_name)
        tt = datetime.datetime.now() - t0
        log.info(f"assert index: {tt}")
        assert len(collection_w.indexes) == 1

        # query
        term_expr = f'{ct.default_int64_field_name} in [1001,1201,999,99]'
        t0 = datetime.datetime.now()
        query_res, _ = collection_w.query(term_expr)
        tt = datetime.datetime.now() - t0
        log.info(f"assert query: {tt}")
        assert len(query_res) == 4

        # reboot a pod
        reboot_pod(chaos_yaml)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        meta_name = chaos_config.get('metadata', None).get('name', None)

        # wait all pods ready
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={meta_name}"
        )
        wait_pods_ready(constants.CHAOS_NAMESPACE,
                        f"app.kubernetes.io/instance={meta_name}")
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={meta_name}"
        )
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={meta_name}")
        log.info("all pods are ready")

        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 3)
        reconnect(connections, alias='default')

        # verify collection persists
        assert utility.has_collection(c_name)
        log.info("assert collection persists")
        collection_w2 = ApiCollectionWrapper()
        collection_w2.init_collection(c_name)
        # verify data persist
        assert collection_w2.num_entities == nb
        log.info("assert data persists")
        # verify index persists
        assert collection_w2.has_index(i_name)
        log.info("assert index persists")
        # verify search results persist
        collection_w2.load()
        search_res, _ = collection_w.search(
            data=search_vectors,
            anns_field=ct.default_float_vec_field_name,
            param=search_params,
            limit=1)
        tt = datetime.datetime.now() - t0
        log.info(f"assert search: {tt}")
        assert len(search_res) == 1
        # verify query results persist
        query_res2, _ = collection_w2.query(term_expr)
        assert len(query_res2) == len(query_res)
        log.info("assert query result persists")
Example #18
class TestChaos(TestChaosBase):
    @pytest.fixture(scope="function", autouse=True)
    def connection(self, host, port):
        connections.add_connection(default={"host": host, "port": port})
        connections.connect(alias='default')

        if connections.has_connection("default") is False:
            raise Exception("no connections")
        self.host = host
        self.port = port
        self.instance_name = get_milvus_instance_name(
            constants.CHAOS_NAMESPACE, host)

    @pytest.fixture(scope="function", autouse=True)
    def init_health_checkers(self):
        c_name = cf.gen_unique_str('MultiReplicasChecker_')
        replicas_num = 2
        shards_num = 2
        checkers = {
            Op.insert:
            InsertFlushChecker(collection_name=c_name, shards_num=shards_num),
            Op.search:
            SearchChecker(collection_name=c_name,
                          shards_num=shards_num,
                          replica_number=replicas_num),
            Op.query:
            QueryChecker(collection_name=c_name,
                         shards_num=shards_num,
                         replica_number=replicas_num)
        }
        self.health_checkers = checkers

    def teardown(self):
        chaos_res = CusResource(kind=self._chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        meta_name = self._chaos_config.get('metadata', None).get('name', None)
        chaos_res.delete(meta_name, raise_ex=False)
        sleep(2)
        log.info(f'Alive threads: {threading.enumerate()}')

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize(
        "is_streaming",
        cc.gen_experiment_config(config_file_name)['is_streaming']
    )  # [False, True]
    @pytest.mark.parametrize(
        "failed_group_scope",
        cc.gen_experiment_config(config_file_name)['failed_group_scope']
    )  # ["one", "except_one" "all"]
    @pytest.mark.parametrize(
        "failed_node_type",
        cc.gen_experiment_config(config_file_name)['failed_node_type']
    )  # ["non_shard_leader", "shard_leader"]
    @pytest.mark.parametrize(
        "chaos_type",
        cc.gen_experiment_config(config_file_name)['chaos_type']
    )  # ["pod-failure", "pod-kill"]
    def test_multi_replicas_with_only_one_group_available(
            self, chaos_type, failed_node_type, failed_group_scope,
            is_streaming):
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        log.info("Test config")
        log.info(cc.gen_experiment_config(config_file_name))
        # log.info(f"chaos_yaml: {chaos_yaml}")
        log.info(connections.get_connection_addr('default'))
        if is_streaming is False:
            del self.health_checkers[Op.insert]
        cc.start_monitor_threads(self.health_checkers)
        # get replicas info
        release_name = self.instance_name
        querynode_id_pod_pair = get_querynode_info(release_name)
        log.info(querynode_id_pod_pair)
        group_list = []
        shard_leader_list = []
        replicas_info, _ = self.health_checkers[
            Op.search].c_wrap.get_replicas()
        for g in replicas_info.groups:
            group_list.append(list(g.group_nodes))
            for shard in g.shards:
                shard_leader_list.append(shard.shard_leader)
        # keep only one group healthy; the other groups are made unhealthy by injecting pod-failure chaos,
        # and in each affected group one pod is put into pod-failure status
        target_pod_list = []
        target_group = []
        group_list = sorted(group_list, key=lambda x: -len(x))
        if failed_group_scope == "one":
            target_group = random.sample(group_list, 1)
        if failed_group_scope == "except_one":
            target_group = random.sample(group_list, len(group_list) - 1)
        if failed_group_scope == "all":
            target_group = group_list[:]
        for g in target_group:
            target_nodes = []
            if failed_node_type == "shard_leader":
                target_nodes = list(set(g) & set(shard_leader_list))
            if failed_node_type == "non_shard_leader":
                target_nodes = list(set(g) - set(shard_leader_list))
            if len(target_nodes) == 0:
                log.info("there is no node satisfied, chose one randomly")
                target_nodes = [random.choice(g)]
            for target_node in target_nodes:
                pod = querynode_id_pod_pair[target_node]
                target_pod_list.append(pod)
        log.info(f"target_pod_list: {target_pod_list}")
        chaos_config = cc.gen_experiment_config(
            f"{str(Path(__file__).absolute().parent)}/chaos_objects/template/{chaos_type}-by-pod-list.yaml"
        )
        chaos_config['metadata'][
            'name'] = f"test-multi-replicase-{int(time.time())}"
        meta_name = chaos_config.get('metadata', None).get('name', None)
        chaos_config['spec']['selector']['pods'][
            'chaos-testing'] = target_pod_list
        self._chaos_config = chaos_config  # cache the chaos config for tear down

        log.info(f"chaos_config: {chaos_config}")
        # wait 20s
        sleep(constants.WAIT_PER_OP * 2)
        # replicas info
        replicas_info, _ = self.health_checkers[
            Op.search].c_wrap.get_replicas()
        log.info(
            f"replicas_info for search collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}"
        )

        # assert statistic:all ops 100% succ
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)
        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)

        chaos_res.create(chaos_config)
        log.info("chaos injected")
        sleep(constants.WAIT_PER_OP * 2)
        # reset counting
        cc.reset_counting(self.health_checkers)

        # wait 120s
        sleep(constants.CHAOS_DURATION)
        log.info(f'Alive threads: {threading.enumerate()}')
        # node info
        querynode_id_pod_pair = get_querynode_info(release_name)
        log.info(querynode_id_pod_pair)
        # replicas info
        replicas_info, _ = self.health_checkers[
            Op.search].c_wrap.get_replicas()
        log.info(
            f"replicas_info for search collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}"
        )

        replicas_info, _ = self.health_checkers[Op.query].c_wrap.get_replicas()
        log.info(
            f"replicas_info for query collection {self.health_checkers[Op.query].c_wrap.name}: {replicas_info}"
        )
        # assert statistic
        log.info("******2nd assert after chaos injected: ")
        expectations = {Op.search: constants.SUCC, Op.query: constants.SUCC}
        if failed_group_scope == "all":
            expectations = {
                Op.search: constants.FAIL,
                Op.query: constants.FAIL
            }
        assert_statistic(self.health_checkers, expectations=expectations)
        # delete chaos
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        sleep(2)
        # wait all pods ready
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={release_name}"
        )
        ready_1 = wait_pods_ready(
            constants.CHAOS_NAMESPACE,
            f"app.kubernetes.io/instance={release_name}")
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={release_name}"
        )
        ready_2 = wait_pods_ready(constants.CHAOS_NAMESPACE,
                                  f"release={release_name}")
        if ready_1 and ready_2:
            log.info("all pods are ready")
        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        # cc.reconnect(connections, alias='default')
        # reset counting again
        cc.reset_counting(self.health_checkers)
        # wait 50s (varies by feature)
        sleep(constants.WAIT_PER_OP * 5)
        # node info
        querynode_id_pod_pair = get_querynode_info(release_name)
        log.info(querynode_id_pod_pair)
        sleep(30)
        # replicas info
        replicas_info, _ = self.health_checkers[
            Op.search].c_wrap.get_replicas()
        log.info(
            f"replicas_info for collection {self.health_checkers[Op.search].c_wrap.name}: {replicas_info}"
        )
        replicas_info, _ = self.health_checkers[Op.query].c_wrap.get_replicas()
        log.info(
            f"replicas_info for collection {self.health_checkers[Op.query].c_wrap.name}: {replicas_info}"
        )
        # assert statistic: all ops success again
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        # assert all expectations
        assert_expectations()

        log.info(
            "*********************Chaos Test Completed**********************")
Example #19
    def test_chaos(self, chaos_yaml):
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        log.info(connections.get_connection_addr('default'))
        self.checker_threads = cc.start_monitor_threads(self.health_checkers)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        self._chaos_config = chaos_config  # cache the chaos config for tear down
        log.info(f"chaos_config: {chaos_config}")

        # parse the test expectations in testcases.yaml
        if self.parser_testcase_config(chaos_yaml) is False:
            log.error("Fail to get the testcase info in testcases.yaml")
            assert False
        # init report
        meta_name = chaos_config.get('metadata', None).get('name', None)
        dir_name = "./reports"
        file_name = f"./reports/{meta_name}.log"
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        # wait 20s
        sleep(constants.WAIT_PER_OP * 2)

        # assert statistic:all ops 100% succ
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            f.write("1st assert before chaos: ")
            f.write(f"{self.health_checkers}\n")
        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("chaos injected")
        log.info(f"chaos information: {chaos_res.get(meta_name)}")
        sleep(constants.WAIT_PER_OP * 2.1)
        # reset counting
        cc.reset_counting(self.health_checkers)

        # wait 40s
        sleep(constants.CHAOS_DURATION)

        for k, t in self.checker_threads.items():
            log.info(f"10s later: Thread {k} is_alive(): {t.is_alive()}")

        # assert statistic
        log.info("******2nd assert after chaos injected: ")
        assert_statistic(self.health_checkers,
                         expectations={
                             Op.create: self.expect_create,
                             Op.insert: self.expect_insert,
                             Op.flush: self.expect_flush,
                             Op.index: self.expect_index,
                             Op.search: self.expect_search,
                             Op.query: self.expect_query
                         })
        with open(file_name, "a+") as f:
            f.write("2nd assert after chaos injected:")
            f.write(f"{self.health_checkers}\n")
        # delete chaos
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        for k, t in self.checker_threads.items():
            log.info(f"Thread {k} is_alive(): {t.is_alive()}")
        sleep(2)

        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        cc.reconnect(connections, alias='default')

        # reset counting again
        cc.reset_counting(self.health_checkers)

        # wait 50s (varies by feature)
        sleep(constants.WAIT_PER_OP * 5)

        # assert statistic: all ops success again
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            f.write("3rd assert after chaos deleted:")
            f.write(f"{self.health_checkers}\n")
        # assert all expectations
        assert_expectations()

        log.info(
            "*********************Chaos Test Completed**********************")
Example #20
    def test_chaos(self, chaos_yaml):
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        log.info(connections.get_connection_addr('default'))
        cc.start_monitor_threads(self.health_checkers)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        meta_name = chaos_config.get('metadata', {}).get('name', None)
        release_name = meta_name
        chaos_config_str = json.dumps(chaos_config)
        chaos_config_str = chaos_config_str.replace("milvus-chaos", release_name)
        chaos_config = json.loads(chaos_config_str)
        self._chaos_config = chaos_config  # cache the chaos config for tear down
        log.info(f"chaos_config: {chaos_config}")
        # parse the test expectations in testcases.yaml
        if self.parser_testcase_config(chaos_yaml, chaos_config) is False:
            log.error("Fail to get the testcase info in testcases.yaml")
            assert False

        # init report
        dir_name = "./reports"
        file_name = f"./reports/{meta_name}.log"
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        # wait 20s
        sleep(constants.WAIT_PER_OP * 2)

        # assert statistic: all ops 100% succeed
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            ts = time.strftime("%Y-%m-%d %H:%M:%S")
            f.write(f"{meta_name}-{ts}\n")
            f.write("1st assert before chaos:\n")
            f.write(record_results(self.health_checkers))
        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("chaos injected")
        log.info(f"chaos information: {chaos_res.get(meta_name)}")
        sleep(constants.WAIT_PER_OP * 2)
        # reset counting
        cc.reset_counting(self.health_checkers)

        # wait 40s
        sleep(constants.CHAOS_DURATION)

        log.info(f'Alive threads: {threading.enumerate()}')

        # assert statistic
        log.info("******2nd assert after chaos injected: ")
        assert_statistic(self.health_checkers,
                         expectations={Op.create: self.expect_create,
                                       Op.insert: self.expect_insert,
                                       Op.flush: self.expect_flush,
                                       Op.index: self.expect_index,
                                       Op.search: self.expect_search,
                                       Op.query: self.expect_query
                                       })
        with open(file_name, "a+") as f:
            f.write("2nd assert after chaos injected:\n")
            f.write(record_results(self.health_checkers))
        # delete chaos
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        log.info(f'Alive threads: {threading.enumerate()}')
        sleep(2)
        # wait all pods ready
        log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={meta_name}")
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"app.kubernetes.io/instance={meta_name}")
        log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={meta_name}")
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={meta_name}")
        log.info("all pods are ready")
        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        cc.reconnect(connections, alias='default')
        # reset counting again
        cc.reset_counting(self.health_checkers)
        # wait 50s (varies by feature)
        sleep(constants.WAIT_PER_OP * 5)
        # assert statistic: all ops success again
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            f.write("3rd assert after chaos deleted:\n")
            f.write(record_results(self.health_checkers))
        # assert all expectations
        assert_expectations()

        log.info("*********************Chaos Test Completed**********************")
Example #21
    def test_bulk_load(self, chaos_type, target_component):
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        log.info(connections.get_connection_addr('default'))
        release_name = self.instance_name
        cc.start_monitor_threads(self.health_checkers)
        chaos_config = cc.gen_experiment_config(
            f"{str(Path(__file__).absolute().parent)}/chaos_objects/{chaos_type}/chaos_{target_component}_{chaos_type}.yaml"
        )
        chaos_config['metadata']['name'] = f"test-bulk-load-{int(time.time())}"
        kind = chaos_config['kind']
        meta_name = chaos_config.get('metadata', {}).get('name', None)
        update_key_value(chaos_config, "release", release_name)
        update_key_value(chaos_config, "app.kubernetes.io/instance",
                         release_name)
        self._chaos_config = chaos_config  # cache the chaos config for tear down
        log.info(f"chaos_config: {chaos_config}")
        # wait 100s
        sleep(constants.WAIT_PER_OP * 10)
        # assert statistic: all ops 100% succeed
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)
        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("chaos injected")
        sleep(constants.WAIT_PER_OP * 10)
        # reset counting
        cc.reset_counting(self.health_checkers)
        # wait for the chaos duration to elapse
        sleep(constants.CHAOS_DURATION)
        log.info(f'Alive threads: {threading.enumerate()}')
        # assert statistic
        log.info("******2nd assert after chaos injected: ")
        assert_statistic(self.health_checkers,
                         expectations={
                             Op.bulk_load: constants.FAIL,
                         })
        # delete chaos
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        sleep(2)
        # wait all pods ready
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={release_name}"
        )
        wait_pods_ready(constants.CHAOS_NAMESPACE,
                        f"app.kubernetes.io/instance={release_name}")
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={release_name}"
        )
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={release_name}")
        log.info("all pods are ready")
        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        cc.reconnect(connections, alias='default')
        # recheck failed tasks in third assert
        self.health_checkers[Op.bulk_load].recheck_failed_task = True
        # reset counting again
        cc.reset_counting(self.health_checkers)
        # wait 100s (varies by feature)
        sleep(constants.WAIT_PER_OP * 10)
        # assert statistic: all ops success again
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        # assert all expectations
        assert_expectations()

        log.info(
            "*********************Chaos Test Completed**********************")