class TestChaos(TestChaosBase):
    """End-to-end chaos test: run health-checker ops continuously while a
    chaos-mesh experiment is injected, then verify op success rates before,
    during, and after the chaos."""

    @pytest.fixture(scope="function", autouse=True)
    def connection(self, host, port):
        """Register and open the default Milvus connection; fail fast if it
        cannot be established. Caches host/port on self for later use."""
        connections.add_connection(default={"host": host, "port": port})
        conn = connections.connect(alias='default')
        if conn is None:
            raise Exception("no connections")
        self.host = host
        self.port = port
        return conn

    @pytest.fixture(scope="function", autouse=True)
    def init_health_checkers(self):
        """Create one checker per monitored Milvus operation and stash the
        mapping on self for the test body and assert_statistic()."""
        checkers = {
            Op.create: CreateChecker(),
            Op.insert: InsertFlushChecker(),
            # Same checker class covers flush when flush=True.
            Op.flush: InsertFlushChecker(flush=True),
            Op.index: IndexChecker(),
            Op.search: SearchChecker(),
            Op.query: QueryChecker()
        }
        self.health_checkers = checkers

    def teardown(self):
        """Best-effort cleanup: delete the chaos custom resource cached by
        test_chaos so a failed run does not leave the experiment active.

        NOTE(review): assumes test_chaos already set self._chaos_config; if
        the test fails before that assignment, this raises AttributeError and
        masks the original failure — confirm whether a getattr guard is wanted.
        """
        chaos_res = CusResource(kind=self._chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        meta_name = self._chaos_config.get('metadata', None).get('name', None)
        # raise_ex=False: deletion failure must not fail teardown.
        chaos_res.delete(meta_name, raise_ex=False)
        sleep(2)
        log.info(f'Alive threads: {threading.enumerate()}')

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', cc.get_chaos_yamls())
    def test_chaos(self, chaos_yaml):
        """Inject the chaos experiment described by chaos_yaml and assert the
        op-success expectations parsed from testcases.yaml at three stages:
        before injection, while injected, and after deletion/recovery."""
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        log.info(connections.get_connection_addr('default'))
        cc.start_monitor_threads(self.health_checkers)

        # parse chaos object and retarget it at this release by rewriting the
        # placeholder release name inside the whole config via a JSON round-trip
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        meta_name = chaos_config.get('metadata', None).get('name', None)
        release_name = meta_name
        chaos_config_str = json.dumps(chaos_config)
        chaos_config_str = chaos_config_str.replace("milvus-chaos", release_name)
        chaos_config = json.loads(chaos_config_str)
        self._chaos_config = chaos_config  # cache the chaos config for tear down
        log.info(f"chaos_config: {chaos_config}")

        # parse the test expectations in testcases.yaml
        if self.parser_testcase_config(chaos_yaml, chaos_config) is False:
            log.error("Fail to get the testcase info in testcases.yaml")
            assert False

        # init report file (one log per chaos experiment)
        dir_name = "./reports"
        file_name = f"./reports/{meta_name}.log"
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)

        # let the checkers accumulate a baseline of results
        sleep(constants.WAIT_PER_OP * 2)

        # stage 1: all ops must succeed before any chaos is applied
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            ts = time.strftime("%Y-%m-%d %H:%M:%S")
            f.write(f"{meta_name}-{ts}\n")
            f.write("1st assert before chaos:\n")
            f.write(record_results(self.health_checkers))

        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("chaos injected")
        log.info(f"chaos information: {chaos_res.get(meta_name)}")
        sleep(constants.WAIT_PER_OP * 2)
        # reset counting so stage-2 statistics only cover the chaos window
        cc.reset_counting(self.health_checkers)
        # let the experiment run for its configured duration
        sleep(constants.CHAOS_DURATION)
        log.info(f'Alive threads: {threading.enumerate()}')

        # stage 2: compare op success against the per-op expectations from
        # testcases.yaml (some ops are expected to degrade under chaos)
        log.info("******2nd assert after chaos injected: ")
        assert_statistic(self.health_checkers,
                         expectations={Op.create: self.expect_create,
                                       Op.insert: self.expect_insert,
                                       Op.flush: self.expect_flush,
                                       Op.index: self.expect_index,
                                       Op.search: self.expect_search,
                                       Op.query: self.expect_query
                                       })
        with open(file_name, "a+") as f:
            f.write("2nd assert after chaos injected:\n")
            f.write(record_results(self.health_checkers))

        # delete chaos and wait for the cluster to recover
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        log.info(f'Alive threads: {threading.enumerate()}')
        sleep(2)
        # wait all pods ready (checked under both label conventions)
        log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={meta_name}")
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"app.kubernetes.io/instance={meta_name}")
        log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={meta_name}")
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={meta_name}")
        log.info("all pods are ready")

        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        cc.reconnect(connections, alias='default')

        # reset counting again so stage-3 statistics only cover recovery
        cc.reset_counting(self.health_checkers)
        # wait for ops to run post-recovery (duration varies by feature)
        sleep(constants.WAIT_PER_OP * 5)

        # stage 3: all ops must succeed again after recovery
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            f.write("3rd assert after chaos deleted:\n")
            f.write(record_results(self.health_checkers))

        # assert all expectations recorded by the checkers
        assert_expectations()
        log.info("*********************Chaos Test Completed**********************")
class TestChaosData:
    """Memory-stress chaos tests targeting querynode, datanode, indexnode and
    etcd individually."""

    @pytest.fixture(scope="function", autouse=True)
    def connection(self, host, port):
        """Register and open the default Milvus connection; fail fast when no
        connection can be established."""
        connections.add_connection(default={"host": host, "port": port})
        conn = connections.connect(alias='default')
        if conn is None:
            raise Exception("no connections")
        return conn

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_querynode(self, connection, chaos_yaml):
        """
        target: explore query node behavior after memory stress chaos injected and recovered
        method: 1. create a collection, insert some data
                2. inject memory stress chaos
                3. load collection and search, query
                4. todo (verify query node response)
                5. delete chaos or chaos finished
                6. release and reload collection, verify search and query is available
        expected: after chaos deleted, load, search and query are all available
        """
        # NOTE(review): relies on a pre-existing collection with this fixed name
        c_name = 'chaos_memory_nx6DNW4q'
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(c_name)
        log.debug(collection_w.schema)
        log.debug(collection_w._shards_num)

        # apply memory stress
        # apply_memory_stress(chaos_yaml)

        # wait memory stress
        # sleep(constants.WAIT_PER_OP * 2)

        # reload the collection and query repeatedly; each query must return
        # exactly the 4 requested primary keys
        collection_w.release()
        collection_w.load()
        term_expr = f'{ct.default_int64_field_name} in [0, 1, 999, 99]'
        for i in range(4):
            t0_query = datetime.datetime.now()
            query_res, _ = collection_w.query(term_expr)
            tt_query = datetime.datetime.now() - t0_query
            log.info(f"{i} query cost: {tt_query}")
            assert len(query_res) == 4

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_datanode(self, chaos_yaml):
        """
        target: test inject memory stress into dataNode
        method: 1.Deploy milvus and limit datanode memory resource
                2.Create collection and insert some data
                3.Inject memory stress chaos
                4.Continue to insert data
        expected: inserts keep succeeding while the datanode is under stress
        """
        # init collection and insert 10 batches of nb entities
        nb = 25000
        dim = 512
        c_name = cf.gen_unique_str('chaos_memory')
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(name=c_name,
                                     schema=cf.gen_default_collection_schema(dim=dim))
        for i in range(10):
            t0 = datetime.datetime.now()
            df = cf.gen_default_dataframe_data(nb=nb, dim=dim)
            res = collection_w.insert(df)[0]
            assert res.insert_count == nb
            # num_entities triggers a flush, so the logged time includes it
            log.info(f'After {i + 1} insert, num_entities: {collection_w.num_entities}')
            tt = datetime.datetime.now() - t0
            log.info(f"{i} insert and flush data cost: {tt}")

        # inject memory stress
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("chaos injected")

        # Continue to insert data while the chaos is active
        collection_w.insert(df)
        log.info(f'Total num entities: {collection_w.num_entities}')

        # delete chaos
        meta_name = chaos_config.get('metadata', None).get('name', None)
        chaos_res.delete(metadata_name=meta_name)

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_indexnode(self, connection, chaos_yaml):
        """
        target: test inject memory stress into indexnode
        method: 1.Deploy milvus and limit indexnode memory resource 1Gi
                2.Create collection and insert some data
                3.Create index
                4.Inject memory stress chaos 512Mi
        expected: index creation completes despite the memory stress
        """
        # vector size: 512*4*nb about 100Mi and create index need 600Mi memory
        nb = 50000
        dim = 512
        c_name = cf.gen_unique_str('chaos_memory')
        index_params = {"index_type": "IVF_SQ8", "metric_type": "L2",
                        "params": {"nlist": 128}}
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(name=c_name,
                                     schema=cf.gen_default_collection_schema(dim=dim),
                                     shards_num=1)

        # insert nb entities in two halves
        for i in range(2):
            t0_insert = datetime.datetime.now()
            df = cf.gen_default_dataframe_data(nb=nb // 2, dim=dim)
            res = collection_w.insert(df)[0]
            assert res.insert_count == nb // 2
            # log.info(f'After {i + 1} insert, num_entities: {collection_w.num_entities}')
            tt_insert = datetime.datetime.now() - t0_insert
            log.info(f"{i} insert data cost: {tt_insert}")

        # flush (num_entities forces the flush and verifies the count)
        t0_flush = datetime.datetime.now()
        assert collection_w.num_entities == nb
        tt_flush = datetime.datetime.now() - t0_flush
        # fixed: log the actual flushed count (was nb * 10)
        log.info(f'flush {nb} entities cost: {tt_flush}')

        # create index
        t0_index = datetime.datetime.now()
        index, _ = collection_w.create_index(field_name=ct.default_float_vec_field_name,
                                             index_params=index_params)
        tt_index = datetime.datetime.now() - t0_index
        log.info(f"create index cost: {tt_index}")
        log.info(collection_w.indexes)

        # indexNode start build index, inject chaos memory stress
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("inject chaos")

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', cc.get_chaos_yamls())
    def test_chaos_memory_stress_etcd(self, chaos_yaml):
        """
        target: test inject memory stress into etcd pods
        method: 1.Continuously and concurrently run milvus operations
                2.Inject memory stress chaos
                3.After the configured duration, delete the chaos
        expected: milvus operation success rates are reported after the run
        """
        mic_checkers = {
            Op.create: CreateChecker(),
            Op.insert: InsertFlushChecker(),
            Op.flush: InsertFlushChecker(flush=True),
            Op.index: IndexChecker(),
            Op.search: SearchChecker(),
            Op.query: QueryChecker()
        }
        # start thread keep running milvus op
        start_monitor_threads(mic_checkers)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        # duration = chaos_config["spec"]["duration"]
        meta_name = chaos_config.get('metadata').get('name')
        duration = chaos_config.get('spec').get('duration')

        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("Chaos injected")

        # convert a duration string like "1h20m30s" into an arithmetic
        # expression "1*3600+20*60+30*1+" + "0"; the trailing '+0' keeps the
        # expression valid for eval() whatever units the string ends with.
        # (Previously 'h'/'m' substitutions left a dangling '+' and eval
        # raised SyntaxError for any duration containing hours or minutes.)
        if isinstance(duration, str):
            duration = duration.replace('h', '*3600+').replace(
                'm', '*60+').replace('s', '*1+') + '+0'
        else:
            log.error("Duration must be string type")

        # Delete experiment after it's over
        timer = threading.Timer(interval=eval(duration),
                                function=chaos_res.delete,
                                args=(meta_name, False))
        timer.start()
        timer.join()

        # output milvus op succ rate
        for k, ch in mic_checkers.items():
            log.debug(f'Succ rate of {k.value}: {ch.succ_rate()}')
class TestChaosData:
    """Data-consistency chaos test: verify collection, data, index, search and
    query results all survive a pod reboot."""

    @pytest.fixture(scope="function", autouse=True)
    def connection(self, host, port):
        """Register and open the default Milvus connection; fail fast when the
        default alias has no live connection."""
        connections.add_connection(default={"host": host, "port": port})
        connections.connect(alias='default')
        if connections.has_connection("default") is False:
            raise Exception("no connections")

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_data_consist(self, connection, chaos_yaml):
        """
        target: verify data consistence after chaos injected and recovered
        method: 1. create a collection, insert some data, search and query
                2. inject a chaos object
                3. reconnect to service
                4. verify a) data entities persists, index persists,
                          b) search and query results persist
        expected: collection data and results persist
        """
        c_name = cf.gen_unique_str('chaos_collection_')
        nb = 5000
        i_name = cf.gen_unique_str('chaos_index_')
        index_params = {"index_type": "IVF_SQ8", "metric_type": "L2",
                        "params": {"nlist": 64}}

        # create
        t0 = datetime.datetime.now()
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(name=c_name,
                                     schema=cf.gen_default_collection_schema())
        tt = datetime.datetime.now() - t0
        log.info(f"assert create: {tt}")
        assert collection_w.name == c_name

        # insert
        data = cf.gen_default_list_data(nb=nb)
        t0 = datetime.datetime.now()
        _, res = collection_w.insert(data)
        tt = datetime.datetime.now() - t0
        log.info(f"assert insert: {tt}")
        assert res

        # flush (num_entities forces the flush and verifies the count)
        t0 = datetime.datetime.now()
        assert collection_w.num_entities == nb
        tt = datetime.datetime.now() - t0
        log.info(f"assert flush: {tt}")

        # search
        collection_w.load()
        search_vectors = cf.gen_vectors(1, ct.default_dim)
        t0 = datetime.datetime.now()
        search_params = {"metric_type": "L2", "params": {"nprobe": 16}}
        search_res, _ = collection_w.search(data=search_vectors,
                                            anns_field=ct.default_float_vec_field_name,
                                            param=search_params, limit=1)
        tt = datetime.datetime.now() - t0
        log.info(f"assert search: {tt}")
        assert len(search_res) == 1

        # index
        t0 = datetime.datetime.now()
        index, _ = collection_w.create_index(field_name=ct.default_float_vec_field_name,
                                             index_params=index_params,
                                             name=i_name)
        tt = datetime.datetime.now() - t0
        log.info(f"assert index: {tt}")
        assert len(collection_w.indexes) == 1

        # query
        term_expr = f'{ct.default_int64_field_name} in [1001,1201,999,99]'
        t0 = datetime.datetime.now()
        query_res, _ = collection_w.query(term_expr)
        tt = datetime.datetime.now() - t0
        log.info(f"assert query: {tt}")
        assert len(query_res) == 4

        # reboot a pod
        reboot_pod(chaos_yaml)

        # parse chaos object to recover the release name
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        meta_name = chaos_config.get('metadata', None).get('name', None)

        # wait all pods ready (checked under both label conventions)
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={meta_name}"
        )
        wait_pods_ready(constants.CHAOS_NAMESPACE,
                        f"app.kubernetes.io/instance={meta_name}")
        log.info(
            f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={meta_name}"
        )
        wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={meta_name}")
        log.info("all pods are ready")

        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 3)
        reconnect(connections, alias='default')

        # verify collection persists
        assert utility.has_collection(c_name)
        log.info("assert collection persists")
        collection_w2 = ApiCollectionWrapper()
        collection_w2.init_collection(c_name)
        # verify data persist
        assert collection_w2.num_entities == nb
        log.info("assert data persists")
        # verify index persists
        assert collection_w2.has_index(i_name)
        log.info("assert index persists")
        # verify search results persist
        collection_w2.load()
        # fixed: search the re-opened collection_w2 (not the stale pre-reboot
        # wrapper) and restart the timer so the logged cost is the search cost
        t0 = datetime.datetime.now()
        search_res, _ = collection_w2.search(data=search_vectors,
                                             anns_field=ct.default_float_vec_field_name,
                                             param=search_params, limit=1)
        tt = datetime.datetime.now() - t0
        log.info(f"assert search: {tt}")
        assert len(search_res) == 1
        # verify query results persist
        query_res2, _ = collection_w2.query(term_expr)
        assert len(query_res2) == len(query_res)
        log.info("assert query result persists")
class TestChaosData:
    """Memory-stress chaos tests: querynode under a release/load/query/search
    loop, datanode under inserts, indexnode under index build, and etcd under
    concurrent checker load."""

    @pytest.fixture(scope="function", autouse=True)
    def connection(self, host, port):
        """Register and open the default Milvus connection; fail fast when no
        connection can be made."""
        connections.add_connection(default={"host": host, "port": port})
        conn = connections.connect(alias='default')
        if conn is None:
            raise Exception("no connections")
        return conn

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_querynode(self, connection, chaos_yaml):
        """
        target: explore query node behavior after memory stress chaos injected and recovered
        method: 1. Create a collection, insert some data
                2. Inject memory stress chaos
                3. Start a thread to load, search and query
                4. After chaos duration, check query search success rate
                5. Delete chaos or chaos finished finally
        expected: 1.If memory is insufficient, querynode is OOMKilled and available after restart
                  2.If memory is sufficient, succ rate of query and search both are 1.0
        """
        # NOTE(review): relies on a pre-existing collection with this fixed name
        c_name = 'chaos_memory_nx6DNW4q'
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(c_name)
        log.debug(collection_w.schema)
        log.debug(collection_w._shards_num)

        # apply memory stress chaos
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("chaos injected")
        # convert "1h20m30s" to "1*3600+20*60+30*1+" + "0" so eval() yields
        # the duration in seconds
        duration = chaos_config.get('spec').get('duration')
        duration = duration.replace('h', '*3600+').replace(
            'm', '*60+').replace('s', '*1+') + '+0'
        meta_name = chaos_config.get('metadata').get('name')

        # wait memory stress
        sleep(constants.WAIT_PER_OP * 2)

        # keep doing release, load, query and search for the chaos duration;
        # the chaos resource is always deleted, and any failure propagates
        # with its original type and traceback (the old
        # `except Exception as e: raise Exception(str(e))` discarded both)
        try:
            start = time.time()
            while time.time() - start < eval(duration):
                collection_w.release()
                collection_w.load()
                term_expr = f'{ct.default_int64_field_name} in {[random.randint(0, 100)]}'
                query_res, _ = collection_w.query(term_expr)
                assert len(query_res) == 1
                search_res, _ = collection_w.search(cf.gen_vectors(1, ct.default_dim),
                                                    ct.default_float_vec_field_name,
                                                    ct.default_search_params,
                                                    ct.default_limit)
                log.debug(search_res[0].ids)
                assert len(search_res[0].ids) == ct.default_limit
        finally:
            chaos_res.delete(meta_name)

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_datanode(self, chaos_yaml):
        """
        target: test inject memory stress into dataNode
        method: 1.Deploy milvus and limit datanode memory resource
                2.Create collection and insert some data
                3.Inject memory stress chaos
                4.Continue to insert data
        expected: inserts keep succeeding while the datanode is under stress
        """
        # init collection and insert 10 batches of nb entities
        nb = 25000
        dim = 512
        c_name = cf.gen_unique_str('chaos_memory')
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(
            name=c_name, schema=cf.gen_default_collection_schema(dim=dim))
        for i in range(10):
            t0 = datetime.datetime.now()
            df = cf.gen_default_dataframe_data(nb=nb, dim=dim)
            res = collection_w.insert(df)[0]
            assert res.insert_count == nb
            # num_entities triggers a flush, so the logged time includes it
            log.info(
                f'After {i + 1} insert, num_entities: {collection_w.num_entities}'
            )
            tt = datetime.datetime.now() - t0
            log.info(f"{i} insert and flush data cost: {tt}")

        # inject memory stress
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("chaos injected")

        # Continue to insert data while the chaos is active
        collection_w.insert(df)
        log.info(f'Total num entities: {collection_w.num_entities}')

        # delete chaos
        meta_name = chaos_config.get('metadata', None).get('name', None)
        chaos_res.delete(metadata_name=meta_name)

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_indexnode(self, connection, chaos_yaml):
        """
        target: test inject memory stress into indexnode
        method: 1.Deploy milvus and limit indexnode memory resource 3 / 4Gi
                2.Create collection and insert some data
                3.Inject memory stress chaos 512Mi
                4.Create index
        expected: index creation completes despite the memory stress
        """
        # vector size: 512*4*nb about 512Mi and create index need 2.8Gi memory
        nb = 256000
        dim = 512
        # c_name = cf.gen_unique_str('chaos_memory')
        # NOTE(review): relies on a pre-existing collection with this fixed name
        c_name = 'chaos_memory_gKs8aSUu'
        index_params = {"index_type": "IVF_SQ8", "metric_type": "L2",
                        "params": {"nlist": 128}}
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(
            name=c_name,
            schema=cf.gen_default_collection_schema(dim=dim),
            shards_num=1)

        # insert nb entities in two halves
        for i in range(2):
            t0_insert = datetime.datetime.now()
            df = cf.gen_default_dataframe_data(nb=nb // 2, dim=dim)
            res = collection_w.insert(df)[0]
            assert res.insert_count == nb // 2
            # log.info(f'After {i + 1} insert, num_entities: {collection_w.num_entities}')
            tt_insert = datetime.datetime.now() - t0_insert
            log.info(f"{i} insert data cost: {tt_insert}")

        # flush (num_entities forces the flush and verifies the count)
        t0_flush = datetime.datetime.now()
        assert collection_w.num_entities == nb
        tt_flush = datetime.datetime.now() - t0_flush
        # fixed: log the actual flushed count (was nb * 10)
        log.info(f'flush {nb} entities cost: {tt_flush}')

        # drop any leftover index so create_index below does real work;
        # fixed: only touch indexes[0] after has_index() confirms one exists
        # (the old order raised IndexError on a collection with no index)
        if collection_w.has_index()[0]:
            log.info(collection_w.indexes[0].params)
            collection_w.drop_index()

        # indexNode start build index, inject chaos memory stress
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("inject chaos")

        # create index
        t0_index = datetime.datetime.now()
        index, _ = collection_w.create_index(
            field_name=ct.default_float_vec_field_name,
            index_params=index_params)
        tt_index = datetime.datetime.now() - t0_index
        log.info(f"create index cost: {tt_index}")
        log.info(collection_w.indexes[0].params)

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', cc.get_chaos_yamls())
    def test_chaos_memory_stress_etcd(self, chaos_yaml):
        """
        target: test inject memory stress into all etcd pods
        method: 1.Deploy milvus and limit etcd memory resource 1Gi with all mode
                2.Continuously and concurrently do milvus operations
                3.Inject memory stress chaos 1024Mi
                4.After duration, delete chaos stress
        expected: Verify milvus operation succ rate
        """
        mic_checkers = {
            Op.create: CreateChecker(),
            Op.insert: InsertFlushChecker(),
            Op.flush: InsertFlushChecker(flush=True),
            Op.index: IndexChecker(),
            Op.search: SearchChecker(),
            Op.query: QueryChecker()
        }
        # start thread keep running milvus op
        start_monitor_threads(mic_checkers)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        # duration = chaos_config["spec"]["duration"]
        meta_name = chaos_config.get('metadata').get('name')
        duration = chaos_config.get('spec').get('duration')

        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("Chaos injected")

        # convert string duration time to an int number in seconds:
        # "1h20m30s" -> "1*3600+20*60+30*1+" + "0", valid for eval()
        if isinstance(duration, str):
            duration = duration.replace('h', '*3600+').replace(
                'm', '*60+').replace('s', '*1+') + '+0'
        else:
            log.error("Duration must be string type")

        # Delete experiment after it's over
        timer = threading.Timer(interval=eval(duration),
                                function=chaos_res.delete,
                                args=(meta_name, False))
        timer.start()
        timer.join()

        # output milvus op succ rate; every op must have fully succeeded
        for k, ch in mic_checkers.items():
            log.debug(f'Succ rate of {k.value}: {ch.succ_rate()}')
            assert ch.succ_rate() == 1.0
class TestChaosData:
    """Memory-stress chaos tests targeting querynode, datanode and indexnode."""

    @pytest.fixture(scope="function", autouse=True)
    def connection(self, host, port):
        """Register and open the default Milvus connection; fail fast when no
        connection can be made."""
        connections.add_connection(default={"host": host, "port": port})
        conn = connections.connect(alias='default')
        if conn is None:
            raise Exception("no connections")
        return conn

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_querynode(self, connection, chaos_yaml):
        """
        target: explore querynode behavior after memory stress chaos injected and recovered
        method: 1. create a collection, insert some data
                2. inject memory stress chaos
                3. load collection and search, query
                4. todo (verify querynode response)
                5. delete chaos or chaos finished
                6. release and reload collection, verify search and query is available
        expected: after chaos deleted, load, search and query are both available
        """
        c_name = cf.gen_unique_str('chaos_memory')
        collection_w = construct_from_data(c_name)
        log.debug(collection_w.schema)

        # apply memory stress
        apply_memory_stress(chaos_yaml)

        # wait memory stress
        sleep(constants.WAIT_PER_OP * 2)

        # reload and query while the stress is active
        collection_w.release()
        collection_w.load()
        term_expr = f'{ct.default_int64_field_name} in [0, 1, 999, 99]'
        t0 = datetime.datetime.now()
        query_res, _ = collection_w.query(term_expr)
        tt = datetime.datetime.now() - t0
        log.info(f"assert query: {tt}")
        assert len(query_res) == 4

        # wait for the chaos to pass, then reload and query again
        sleep(constants.WAIT_PER_OP * 5)
        collection_w.release()
        collection_w.load()
        term_expr = f'{ct.default_int64_field_name} in [0, 1, 999, 99]'
        t0 = datetime.datetime.now()
        query_res, _ = collection_w.query(term_expr)
        tt = datetime.datetime.now() - t0
        log.info(f"assert query: {tt}")
        assert len(query_res) == 4

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_datanode(self, chaos_yaml):
        """
        target: test inject memory stress into dataNode
        method: 1.Deploy milvus and limit datanode memory resource
                2.Create collection and insert some data
                3.Inject memory stress chaos
                4.Continue to insert data
        expected: inserts keep succeeding while the datanode is under stress
        """
        # init collection and insert 10 batches of nb entities
        nb = 25000
        dim = 512
        c_name = cf.gen_unique_str('chaos_memory')
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(
            name=c_name, schema=cf.gen_default_collection_schema(dim=dim))
        for i in range(10):
            t0 = datetime.datetime.now()
            df = cf.gen_default_dataframe_data(nb=nb, dim=dim)
            res = collection_w.insert(df)[0]
            assert res.insert_count == nb
            # num_entities triggers a flush, so the logged time includes it
            log.info(
                f'After {i+1} insert, num_entities: {collection_w.num_entities}'
            )
            tt = datetime.datetime.now() - t0
            log.info(f"{i} insert and flush data cost: {tt}")

        # inject memory stress
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("chaos injected")

        # Continue to insert data while the chaos is active
        collection_w.insert(df)
        log.info(f'Total num entities: {collection_w.num_entities}')

        # delete chaos
        meta_name = chaos_config.get('metadata', None).get('name', None)
        chaos_res.delete(metadata_name=meta_name)

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', get_chaos_yamls())
    def test_chaos_memory_stress_indexnode(self, connection, chaos_yaml):
        """
        target: test inject memory stress into indexnode
        method: 1.Deploy milvus and limit indexnode memory resource 1Gi
                2.Create collection and insert some data
                3.Create index
                4.Inject memory stress chaos 512Mi
        expected: index creation completes despite the memory stress
        """
        # vector size: 512*4*nb about 100Mi and create index need 600Mi memory
        nb = 50000
        dim = 512
        c_name = cf.gen_unique_str('chaos_memory')
        index_params = {"index_type": "IVF_SQ8", "metric_type": "L2",
                        "params": {"nlist": 128}}
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(
            name=c_name,
            schema=cf.gen_default_collection_schema(dim=dim),
            shards_num=1)

        # insert nb entities in two halves
        for i in range(2):
            t0_insert = datetime.datetime.now()
            df = cf.gen_default_dataframe_data(nb=nb // 2, dim=dim)
            res = collection_w.insert(df)[0]
            assert res.insert_count == nb // 2
            # log.info(f'After {i + 1} insert, num_entities: {collection_w.num_entities}')
            tt_insert = datetime.datetime.now() - t0_insert
            log.info(f"{i} insert data cost: {tt_insert}")

        # flush (num_entities forces the flush and verifies the count)
        t0_flush = datetime.datetime.now()
        assert collection_w.num_entities == nb
        tt_flush = datetime.datetime.now() - t0_flush
        # fixed: log the actual flushed count (was nb * 10)
        log.info(f'flush {nb} entities cost: {tt_flush}')

        # create index
        # NOTE(review): chaos is injected only after create_index returns, so
        # the stress may not overlap the build — confirm intended ordering
        t0_index = datetime.datetime.now()
        index, _ = collection_w.create_index(
            field_name=ct.default_float_vec_field_name,
            index_params=index_params)
        tt_index = datetime.datetime.now() - t0_index
        log.info(f"create index cost: {tt_index}")
        log.info(collection_w.indexes)

        # indexNode start build index, inject chaos memory stress
        chaos_config = gen_experiment_config(chaos_yaml)
        log.debug(chaos_config)
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.debug("inject chaos")
class TestChaos(TestChaosBase):
    """End-to-end chaos test variant that additionally tracks the checker
    threads and terminates the checkers in teardown."""

    @pytest.fixture(scope="function", autouse=True)
    def connection(self, host, port):
        """Register and open the default Milvus connection; fail fast if it
        cannot be established. Caches host/port on self for later use."""
        connections.add_connection(default={"host": host, "port": port})
        conn = connections.connect(alias='default')
        if conn is None:
            raise Exception("no connections")
        self.host = host
        self.port = port
        return conn

    @pytest.fixture(scope="function", autouse=True)
    def init_health_checkers(self, connection):
        """Create one checker per monitored Milvus operation (after the
        connection fixture, which this depends on) and stash the mapping on
        self for the test body and assert_statistic()."""
        checkers = {
            Op.create: CreateChecker(),
            Op.insert: InsertFlushChecker(),
            # Same checker class covers flush when flush=True.
            Op.flush: InsertFlushChecker(flush=True),
            Op.index: IndexChecker(),
            Op.search: SearchChecker(),
            Op.query: QueryChecker()
        }
        self.health_checkers = checkers

    def teardown(self):
        """Best-effort cleanup: delete the chaos custom resource and stop all
        checkers so no monitor threads outlive the test.

        NOTE(review): assumes test_chaos already set self._chaos_config and
        self.checker_threads; if the test fails before those assignments this
        raises AttributeError and masks the original failure — confirm
        whether getattr guards are wanted.
        """
        chaos_res = CusResource(kind=self._chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        meta_name = self._chaos_config.get('metadata', None).get('name', None)
        # raise_ex=False: deletion failure must not fail teardown.
        chaos_res.delete(meta_name, raise_ex=False)
        for k, ch in self.health_checkers.items():
            ch.terminate()
            log.info(f"tear down: checker {k} terminated")
        sleep(2)
        for k, t in self.checker_threads.items():
            log.info(f"Thread {k} is_alive(): {t.is_alive()}")

    @pytest.mark.tags(CaseLabel.L3)
    @pytest.mark.parametrize('chaos_yaml', cc.get_chaos_yamls())
    def test_chaos(self, chaos_yaml):
        """Inject the chaos experiment described by chaos_yaml and assert the
        op-success expectations parsed from testcases.yaml at three stages:
        before injection, while injected, and after deletion/recovery."""
        # start the monitor threads to check the milvus ops
        log.info("*********************Chaos Test Start**********************")
        log.info(connections.get_connection_addr('default'))
        self.checker_threads = cc.start_monitor_threads(self.health_checkers)

        # parse chaos object
        chaos_config = cc.gen_experiment_config(chaos_yaml)
        self._chaos_config = chaos_config  # cache the chaos config for tear down
        log.info(f"chaos_config: {chaos_config}")

        # parse the test expectations in testcases.yaml
        if self.parser_testcase_config(chaos_yaml) is False:
            log.error("Fail to get the testcase info in testcases.yaml")
            assert False

        # init report file (one log per chaos experiment)
        meta_name = chaos_config.get('metadata', None).get('name', None)
        dir_name = "./reports"
        file_name = f"./reports/{meta_name}.log"
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)

        # let the checkers accumulate a baseline of results
        sleep(constants.WAIT_PER_OP * 2)

        # stage 1: all ops must succeed before any chaos is applied
        log.info("******1st assert before chaos: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            f.write("1st assert before chaos: ")
            f.write(f"{self.health_checkers}\n")

        # apply chaos object
        chaos_res = CusResource(kind=chaos_config['kind'],
                                group=constants.CHAOS_GROUP,
                                version=constants.CHAOS_VERSION,
                                namespace=constants.CHAOS_NAMESPACE)
        chaos_res.create(chaos_config)
        log.info("chaos injected")
        log.info(f"chaos information: {chaos_res.get(meta_name)}")
        sleep(constants.WAIT_PER_OP * 2.1)
        # reset counting so stage-2 statistics only cover the chaos window
        cc.reset_counting(self.health_checkers)
        # let the experiment run for its configured duration
        sleep(constants.CHAOS_DURATION)
        for k, t in self.checker_threads.items():
            log.info(f"10s later: Thread {k} is_alive(): {t.is_alive()}")

        # stage 2: compare op success against the per-op expectations from
        # testcases.yaml (some ops are expected to degrade under chaos)
        log.info("******2nd assert after chaos injected: ")
        assert_statistic(self.health_checkers,
                         expectations={
                             Op.create: self.expect_create,
                             Op.insert: self.expect_insert,
                             Op.flush: self.expect_flush,
                             Op.index: self.expect_index,
                             Op.search: self.expect_search,
                             Op.query: self.expect_query
                         })
        with open(file_name, "a+") as f:
            f.write("2nd assert after chaos injected:")
            f.write(f"{self.health_checkers}\n")

        # delete chaos and let the cluster recover
        chaos_res.delete(meta_name)
        log.info("chaos deleted")
        for k, t in self.checker_threads.items():
            log.info(f"Thread {k} is_alive(): {t.is_alive()}")
        sleep(2)

        # reconnect if needed
        sleep(constants.WAIT_PER_OP * 2)
        cc.reconnect(connections, alias='default')

        # reset counting again so stage-3 statistics only cover recovery
        cc.reset_counting(self.health_checkers)
        # wait for ops to run post-recovery (duration varies by feature)
        sleep(constants.WAIT_PER_OP * 5)

        # stage 3: all ops must succeed again after recovery
        log.info("******3rd assert after chaos deleted: ")
        assert_statistic(self.health_checkers)
        with open(file_name, "a+") as f:
            f.write("3rd assert after chaos deleted:")
            f.write(f"{self.health_checkers}\n")

        # assert all expectations recorded by the checkers
        assert_expectations()

        log.info(
            "*********************Chaos Test Completed**********************")