def export_pod_logs(namespace, label_selector, release_name=None):
    """
    export pod logs with label selector to '/tmp/milvus'

    :param namespace: the namespace where the release
    :type namespace: str

    :param label_selector: labels to restrict which pods logs to export
    :type label_selector: str

    :param release_name: use the release name as server logs director name
    :type release_name: str

    :example:
        >>> export_pod_logs("chaos-testing", "app.kubernetes.io/instance=mic-milvus")
    """
    # Validate release_name only when it was actually supplied.  The previous
    # version also rejected the default None with a TypeError, which broke
    # every call that omitted release_name (including the docstring example).
    if release_name is not None:
        if not isinstance(release_name, str):
            raise TypeError("Got an unexpected non-string release_name")
        if len(release_name.strip()) == 0:
            raise ValueError("Got an unexpected space release_name")
    pod_log_path = '/tmp/milvus_logs' if release_name is None else f'/tmp/milvus_logs/{release_name}'
    if not os.path.isdir(pod_log_path):
        os.makedirs(pod_log_path)
    # get pods and export logs
    items = get_pod_list(namespace, label_selector=label_selector)
    pod_name = None  # keep bound so the error message is valid if the loop body raises
    try:
        for item in items:
            pod_name = item.metadata.name
            # pass the namespace explicitly so logs come from the namespace the
            # pods were listed in, independent of the current kubectl context
            os.system(
                f'kubectl logs {pod_name} -n {namespace} > {pod_log_path}/{pod_name}.log 2>&1'
            )
    except Exception as e:
        log.error(f"Exception when export pod {pod_name} logs: %s\n" % e)
        raise Exception(str(e)) from e
def _update_configs(configs, template=None):
    """
    Merge customized configs into a (possibly templated) milvus deployment spec.

    :param configs: configuration keys/values describing the milvus deployment
    :type configs: dict
    :param template: optional path to a YAML template to start from
    :return: a plain dict of the merged configuration, or None when *configs*
             is not a dict
    """
    if not isinstance(configs, dict):
        log.error("customize configurations must be in dict type")
        return None

    # Start either from the YAML template or from a fresh spec skeleton.
    if template is not None:
        merged = benedict.from_yaml(template)
    else:
        merged = benedict()
        merged['apiVersion'] = f'{MILVUS_GRP}/{MILVUS_VER}'
        merged['kind'] = MILVUS_KIND

    # Overlay every customized entry on top of the base spec.
    for key, value in configs.items():
        merged[key] = value

    # Hand back a plain python dict when the underlying dict is available.
    underlying = merged._dict
    return underlying if underlying is not None else merged
def check_query_results(query_res, func_name, check_items):
    """
    According to the check_items to check actual query result, which return from func_name.

    :param query_res: A list that contains all results
    :type query_res: list

    :param func_name: Query API name
    :type func_name: str

    :param check_items: The items expected to be checked, including exp_res, with_vec
                        The type of exp_res value is as same as query_res
                        The type of with_vec value is bool, True value means check
                        vector field, False otherwise
    :type check_items: dict
    """
    if func_name != 'query':
        log.warning("The function name is {} rather than {}".format(
            func_name, "query"))
    if not isinstance(query_res, list):
        raise Exception("The query result to check isn't list type object")
    if len(check_items) == 0:
        raise Exception("No expect values found in the check task")
    exp_res = check_items.get("exp_res", None)
    with_vec = check_items.get("with_vec", False)
    primary_field = check_items.get("primary_field", None)
    if exp_res is not None:
        # query_res is guaranteed to be a list here (validated above), so the
        # former redundant isinstance check and its unreachable else branch
        # were removed.
        assert pc.equal_entities_list(exp=exp_res, actual=query_res,
                                      primary_field=primary_field,
                                      with_vec=with_vec)
        return True
    # No expected result supplied: nothing to compare, just warn.
    log.warning(f'Expected query result is {exp_res}')
def read_pod_log(namespace, label_selector, release_name):
    """
    Read the logs of every pod matched by *label_selector* via the kubernetes
    API and write each to /tmp/milvus_logs/<release_name>/<pod>.log.

    :param namespace: namespace to list pods in
    :type namespace: str
    :param label_selector: labels restricting which pods to read
    :type label_selector: str
    :param release_name: used as the log directory name
    :type release_name: str
    :raises Exception: when the kubernetes API call fails
    """
    init_k8s_client_config()
    items = get_pod_list(namespace, label_selector=label_selector)
    pod = None  # keep bound so the error message is valid if setup fails before the loop
    try:
        # export log to /tmp/release_name path
        pod_log_path = f'/tmp/milvus_logs/{release_name}'
        if not os.path.isdir(pod_log_path):
            os.makedirs(pod_log_path)
        api_instance = client.CoreV1Api()
        for item in items:
            pod = item.metadata.name
            log.debug(f'Start to read {pod} log')
            # async_req returns a future-like object; .get() blocks for the text
            logs = api_instance.read_namespaced_pod_log(name=pod,
                                                        namespace=namespace,
                                                        async_req=True)
            with open(f'{pod_log_path}/{pod}.log', "w") as f:
                f.write(logs.get())
    except ApiException as e:
        log.error(f"Exception when read pod {pod} logs: %s\n" % e)
        # chain the cause so the original API error is preserved in tracebacks
        raise Exception(str(e)) from e
def check_search_results(search_res, func_name, check_items):
    """
    target: check the search results
    method: 1. check the query number
            2. check the limit(topK) and ids
            3. check the distance
    expected: check the search is ok

    :param search_res: search result (or async future when check_items['_async'])
    :param func_name: API name, expected to be 'search'
    :type func_name: str
    :param check_items: expected values; supported keys: nq, limit, ids, _async
    :type check_items: dict
    :return: True when all checks pass
    :raises Exception: when check_items is empty
    """
    log.info("search_results_check: checking the searching results")
    if func_name != 'search':
        log.warning("The function name is {} rather than {}".format(
            func_name, "search"))
    if len(check_items) == 0:
        raise Exception("No expect values found in the check task")
    # Resolve async results first.  The original nested a second truthiness
    # test on the same "_async" key, which was redundant.
    if check_items.get("_async", None):
        search_res.done()
        search_res = search_res.result()
    if len(search_res) != check_items["nq"]:
        log.error("search_results_check: Numbers of query searched (%d) "
                  "is not equal with expected (%d)"
                  % (len(search_res), check_items["nq"]))
        assert len(search_res) == check_items["nq"]
    else:
        log.info(
            "search_results_check: Numbers of query searched is correct")
    for hits in search_res:
        if (len(hits) != check_items["limit"]) \
                or (len(hits.ids) != check_items["limit"]):
            log.error("search_results_check: limit(topK) searched (%d) "
                      "is not equal with expected (%d)"
                      % (len(hits), check_items["limit"]))
            assert len(hits) == check_items["limit"]
            assert len(hits.ids) == check_items["limit"]
        elif check_items.get("ids", None) is not None:
            ids_match = pc.list_contain_check(list(hits.ids),
                                              list(check_items["ids"]))
            if not ids_match:
                log.error("search_results_check: ids searched not match")
            assert ids_match
        # else: just check nq and topk, no specific ids need checking
    log.info("search_results_check: limit (topK) and "
             "ids searched for %d queries are correct" % len(search_res))
    return True
def ip_check(ip):
    """
    Return True when *ip* is the literal "localhost" or a syntactically valid
    dotted-quad IPv4 address, False otherwise (with an error log).

    :param ip: the address to validate
    :type ip: str
    :return: bool
    """
    if ip == "localhost":
        return True

    if not isinstance(ip, str):
        log.error("[IP_CHECK] IP(%s) is not a string." % ip)
        return False

    _list = ip.split('.')
    if len(_list) != 4:
        log.error("[IP_CHECK] IP(%s) is wrong, please check manually." % ip)
        return False

    for i in _list:
        # Each octet must be all digits AND within 0-255; the previous version
        # only checked isdigit(), so e.g. "999.1.1.1" passed the check.
        if not str(i).isdigit() or not 0 <= int(i) <= 255:
            log.error("[IP_CHECK] IP(%s) is wrong, please check manually." % ip)
            return False

    return True
def check_content(request):
    """Return the value of the --check_content command-line option."""
    # NOTE(review): error-level logging of a separator and the function name
    # looks like leftover debug tracing; consider downgrading to log.debug —
    # confirm before changing.
    log.error("^" * 50)
    log.error("check_content")
    return request.config.getoption("--check_content")
def restart_server(helm_release_name):
    """
    Delete the milvus server pod of the given helm release and wait (up to
    ``timeout`` seconds) for a replacement pod to reach the Running phase.

    :param helm_release_name: helm release whose pod should be restarted
    :type helm_release_name: str
    :return: True on success, False on delete failure or readiness timeout
    :raises Exception: when no matching pod is found
    """
    # NOTE(review): this function uses a module-level `namespace` that is not
    # defined in this block — presumably a module global; verify at call site.
    res = True
    timeout = 120
    from kubernetes import client, config
    client.rest.logger.setLevel(log.WARNING)
    # service_name = "%s.%s.svc.cluster.local" % (helm_release_name, namespace)
    config.load_kube_config()
    v1 = client.CoreV1Api()
    pod_name = None
    # config_map_names = v1.list_namespaced_config_map(namespace, pretty='true')
    # body = {"replicas": 0}
    pods = v1.list_namespaced_pod(namespace)
    # Find the first pod belonging to the release, skipping the mysql pod.
    for i in pods.items:
        if i.metadata.name.find(
                helm_release_name) != -1 and i.metadata.name.find(
                    "mysql") == -1:
            pod_name = i.metadata.name
            break
    # v1.patch_namespaced_config_map(config_map_name, namespace, body, pretty='true')
    # status_res = v1.read_namespaced_service_status(helm_release_name, namespace, pretty='true')
    log.debug("Pod name: %s" % pod_name)
    if pod_name is not None:
        try:
            v1.delete_namespaced_pod(pod_name, namespace)
        except Exception as e:
            log.error(str(e))
            log.error(
                "Exception when calling CoreV1Api->delete_namespaced_pod")
            res = False
            return res
        # NOTE(review): messages below are informational but logged at error
        # level — likely leftover debug tracing; confirm before changing.
        log.error("Sleep 10s after pod deleted")
        time.sleep(10)
        # check if restart successfully
        pods = v1.list_namespaced_pod(namespace)
        for i in pods.items:
            pod_name_tmp = i.metadata.name
            log.error(pod_name_tmp)
            if pod_name_tmp == pod_name:
                # the old pod may still be listed while terminating; skip it
                continue
            elif pod_name_tmp.find(helm_release_name
                                   ) == -1 or pod_name_tmp.find("mysql") != -1:
                # not a pod of this release (or it is the mysql pod); skip
                continue
            else:
                status_res = v1.read_namespaced_pod_status(pod_name_tmp,
                                                           namespace,
                                                           pretty='true')
                log.error(status_res.status.phase)
                start_time = time.time()
                ready_break = False
                # Poll the replacement pod until Running or timeout.
                while time.time() - start_time <= timeout:
                    log.error(time.time())
                    status_res = v1.read_namespaced_pod_status(pod_name_tmp,
                                                               namespace,
                                                               pretty='true')
                    if status_res.status.phase == "Running":
                        log.error("Already running")
                        ready_break = True
                        time.sleep(10)
                        break
                    else:
                        time.sleep(1)
                if time.time() - start_time > timeout:
                    log.error("Restart pod: %s timeout" % pod_name_tmp)
                    res = False
                    return res
                if ready_break:
                    break
    else:
        raise Exception("Pod: %s not found" % pod_name)
    # The variables below feed the commented-out log-reading code kept for
    # reference; they are currently unused.
    follow = True
    pretty = True
    previous = True  # bool | Return previous terminated container logs. Defaults to false. (optional)
    since_seconds = 56  # int | A relative time in seconds before the current time from which to show logs. If this value precedes the time a pod was started, only logs since the pod start will be returned. If this value is in the future, no logs will be returned. Only one of sinceSeconds or sinceTime may be specified. (optional)
    timestamps = True  # bool | If true, add an RFC3339 or RFC3339Nano timestamp at the beginning of every line of log output. Defaults to false. (optional)
    container = "milvus"
    # start_time = time.time()
    # while time.time() - start_time <= timeout:
    #     try:
    #         api_response = v1.read_namespaced_pod_log(pod_name_tmp, namespace, container=container, follow=follow,
    #                                                   pretty=pretty, previous=previous, since_seconds=since_seconds,
    #                                                   timestamps=timestamps)
    #         log.error(api_response)
    #         return res
    #     except Exception as e:
    #         log.error("Exception when calling CoreV1Api->read_namespaced_pod_log: %s\n" % e)
    #         # waiting for server start
    #         time.sleep(5)
    #         # res = False
    #         # return res
    # if time.time() - start_time > timeout:
    #     log.error("Restart pod: %s timeout" % pod_name_tmp)
    #     res = False
    return res
def test_scale_data_node(self):
    """
    target: test scale dataNode
    method: 1.deploy milvus cluster with 2 dataNode
            2.create collection with shards_num=5
            3.continuously insert new data (daemon thread)
            4.expand dataNode from 2 to 5
            5.create new collection with shards_num=2
            6.continuously insert new collection new data (daemon thread)
            7.shrink dataNode from 5 to 3
    expected: Verify milvus remains healthy, Insert and flush successfully
              during scale; Average dataNode memory usage
    """
    release_name = "scale-data"
    image_tag = get_latest_tag()
    image = f'{constants.IMAGE_REPOSITORY}:{image_tag}'
    fail_count = 0
    # Operator config: 2 dataNodes initially, LoadBalancer proxy for access.
    data_config = {
        'metadata.namespace': constants.NAMESPACE,
        'metadata.name': release_name,
        'spec.components.image': image,
        'spec.components.proxy.serviceType': 'LoadBalancer',
        'spec.components.dataNode.replicas': 2,
        'spec.config.dataCoord.enableCompaction': True,
        'spec.config.dataCoord.enableGarbageCollection': True
    }
    mic = MilvusOperator()
    mic.install(data_config)
    if mic.wait_for_healthy(release_name, constants.NAMESPACE, timeout=1200):
        host = mic.endpoint(release_name, constants.NAMESPACE).split(':')[0]
    else:
        # log.warning(f'Deploy {release_name} timeout and ready to uninstall')
        # mic.uninstall(release_name, namespace=constants.NAMESPACE)
        raise BaseException(f'Milvus healthy timeout 1200s')
    try:
        # connect
        connections.add_connection(default={"host": host, "port": 19530})
        connections.connect(alias='default')
        # create
        c_name = cf.gen_unique_str("scale_query")
        # c_name = 'scale_query_DymS7kI4'
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(
            name=c_name,
            schema=cf.gen_default_collection_schema(),
            shards_num=5)
        tmp_nb = 10000

        def do_insert():
            # Daemon thread: keep inserting into the first collection so data
            # flows continuously while dataNodes are scaled.
            while True:
                tmp_df = cf.gen_default_dataframe_data(tmp_nb)
                collection_w.insert(tmp_df)
                log.debug(collection_w.num_entities)

        t_insert = threading.Thread(target=do_insert, args=(), daemon=True)
        t_insert.start()

        # scale dataNode to 5
        mic.upgrade(release_name,
                    {'spec.components.dataNode.replicas': 5},
                    constants.NAMESPACE)
        mic.wait_for_healthy(release_name, constants.NAMESPACE)
        wait_pods_ready(constants.NAMESPACE,
                        f"app.kubernetes.io/instance={release_name}")
        log.debug("Expand dataNode test finished")

        # create new collection and insert
        new_c_name = cf.gen_unique_str("scale_query")
        collection_w_new = ApiCollectionWrapper()
        collection_w_new.init_collection(
            name=new_c_name,
            schema=cf.gen_default_collection_schema(),
            shards_num=2)

        def do_new_insert():
            # Daemon thread: insert into the second collection during shrink.
            while True:
                tmp_df = cf.gen_default_dataframe_data(tmp_nb)
                collection_w_new.insert(tmp_df)
                log.debug(collection_w_new.num_entities)

        t_insert_new = threading.Thread(target=do_new_insert, args=(),
                                        daemon=True)
        t_insert_new.start()

        # scale dataNode to 3
        mic.upgrade(release_name,
                    {'spec.components.dataNode.replicas': 3},
                    constants.NAMESPACE)
        mic.wait_for_healthy(release_name, constants.NAMESPACE)
        wait_pods_ready(constants.NAMESPACE,
                        f"app.kubernetes.io/instance={release_name}")
        log.debug(collection_w.num_entities)
        # let the daemon inserters run against the shrunk cluster for a while
        time.sleep(300)
        log.debug("Shrink dataNode test finished")
    except Exception as e:
        log.error(str(e))
        fail_count += 1
        # raise Exception(str(e))
    finally:
        log.info(f'Test finished with {fail_count} fail request')
        assert fail_count <= 1
        label = f"app.kubernetes.io/instance={release_name}"
        log.info('Start to export milvus pod logs')
        read_pod_log(namespace=constants.NAMESPACE, label_selector=label,
                     release_name=release_name)
        mic.uninstall(release_name, namespace=constants.NAMESPACE)
def test_scale_query_node(self):
    """
    target: test scale queryNode
    method: 1.deploy milvus cluster with 1 queryNode
            2.prepare work (connect, create, insert, index and load)
            3.continuously search (daemon thread)
            4.expand queryNode from 2 to 5
            5.continuously insert new data (daemon thread)
            6.shrink queryNode from 5 to 3
    expected: Verify milvus remains healthy and search successfully
              during scale
    """
    fail_count = 0
    release_name = "scale-query"
    image_tag = get_latest_tag()
    image = f'{constants.IMAGE_REPOSITORY}:{image_tag}'
    # Operator config: 1 queryNode initially.
    query_config = {
        'metadata.namespace': constants.NAMESPACE,
        'metadata.name': release_name,
        'spec.components.image': image,
        'spec.components.proxy.serviceType': 'LoadBalancer',
        'spec.components.queryNode.replicas': 1,
        'spec.config.dataCoord.enableCompaction': True,
        'spec.config.dataCoord.enableGarbageCollection': True
    }
    mic = MilvusOperator()
    mic.install(query_config)
    if mic.wait_for_healthy(release_name, constants.NAMESPACE, timeout=1200):
        host = mic.endpoint(release_name, constants.NAMESPACE).split(':')[0]
    else:
        # log.warning(f'Deploy {release_name} timeout and ready to uninstall')
        # mic.uninstall(release_name, namespace=constants.NAMESPACE)
        raise BaseException(f'Milvus healthy timeout 1200s')
    try:
        # connect
        connections.add_connection(default={"host": host, "port": 19530})
        connections.connect(alias='default')
        # create
        c_name = cf.gen_unique_str("scale_query")
        # c_name = 'scale_query_DymS7kI4'
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(name=c_name,
                                     schema=cf.gen_default_collection_schema(),
                                     shards_num=2)
        # insert two segments
        for i in range(3):
            df = cf.gen_default_dataframe_data(nb)
            collection_w.insert(df)
            log.debug(collection_w.num_entities)
        # create index
        collection_w.create_index(ct.default_float_vec_field_name,
                                  default_index_params)
        assert collection_w.has_index()[0]
        assert collection_w.index()[0] == Index(collection_w.collection,
                                                ct.default_float_vec_field_name,
                                                default_index_params)
        # load
        collection_w.load()

        # scale queryNode to 5
        mic.upgrade(release_name,
                    {'spec.components.queryNode.replicas': 5},
                    constants.NAMESPACE)

        # continuously search
        def do_search():
            # Daemon thread: keep searching so queries run throughout the
            # expand/shrink operations.
            while True:
                search_res, _ = collection_w.search(
                    cf.gen_vectors(1, ct.default_dim),
                    ct.default_float_vec_field_name,
                    ct.default_search_params, ct.default_limit)
                log.debug(search_res[0].ids)
                assert len(search_res[0].ids) == ct.default_limit

        t_search = threading.Thread(target=do_search, args=(), daemon=True)
        t_search.start()

        # wait new QN running, continuously insert
        mic.wait_for_healthy(release_name, constants.NAMESPACE)
        wait_pods_ready(constants.NAMESPACE,
                        f"app.kubernetes.io/instance={release_name}")

        def do_insert():
            # Daemon thread: keep inserting during shrink.
            while True:
                tmp_df = cf.gen_default_dataframe_data(1000)
                collection_w.insert(tmp_df)

        t_insert = threading.Thread(target=do_insert, args=(), daemon=True)
        t_insert.start()
        log.debug(collection_w.num_entities)
        time.sleep(20)
        log.debug("Expand querynode test finished")

        # shrink queryNode from 5 to 3
        mic.upgrade(release_name,
                    {'spec.components.queryNode.replicas': 3},
                    constants.NAMESPACE)
        mic.wait_for_healthy(release_name, constants.NAMESPACE)
        wait_pods_ready(constants.NAMESPACE,
                        f"app.kubernetes.io/instance={release_name}")
        log.debug(collection_w.num_entities)
        time.sleep(60)
        log.debug("Shrink querynode test finished")
    except Exception as e:
        log.error(str(e))
        fail_count += 1
        # raise Exception(str(e))
    finally:
        log.info(f'Test finished with {fail_count} fail request')
        assert fail_count <= 1
        label = f"app.kubernetes.io/instance={release_name}"
        log.info('Start to export milvus pod logs')
        read_pod_log(namespace=constants.NAMESPACE, label_selector=label,
                     release_name=release_name)
        mic.uninstall(release_name, namespace=constants.NAMESPACE)
def test_scale_proxy(self):
    """
    target: test milvus operation after proxy expand
    method: 1.deploy 1 proxy replicas
            2.milvus e2e test in parallel
            3.expand proxy pod from 1 to 5
            4.milvus e2e test
            5.shrink proxy from 5 to 2
    expected: 1.verify data consistent and func work
    """
    # deploy milvus cluster with one proxy
    fail_count = 0
    release_name = "scale-proxy"
    image_tag = get_latest_tag()
    image = f'{constants.IMAGE_REPOSITORY}:{image_tag}'
    data_config = {
        'metadata.namespace': constants.NAMESPACE,
        'metadata.name': release_name,
        'spec.mode': 'cluster',
        'spec.components.image': image,
        'spec.components.proxy.serviceType': 'LoadBalancer',
        'spec.components.proxy.replicas': 1,
        'spec.components.dataNode.replicas': 2,
        'spec.config.common.retentionDuration': 60
    }
    mic = MilvusOperator()
    mic.install(data_config)
    if mic.wait_for_healthy(release_name, constants.NAMESPACE, timeout=1800):
        host = mic.endpoint(release_name, constants.NAMESPACE).split(':')[0]
    else:
        raise MilvusException(message=f'Milvus healthy timeout 1800s')
    try:
        c_name = cf.gen_unique_str("proxy_scale")
        # e2e with 2 parallel processes against the single-proxy deployment
        e2e_milvus_parallel(2, host, c_name)
        log.info('Milvus test before expand')
        # expand proxy replicas from 1 to 5
        mic.upgrade(release_name, {'spec.components.proxy.replicas': 5},
                    constants.NAMESPACE)
        mic.wait_for_healthy(release_name, constants.NAMESPACE)
        wait_pods_ready(constants.NAMESPACE,
                        f"app.kubernetes.io/instance={release_name}")
        e2e_milvus_parallel(5, host, c_name)
        log.info('Milvus test after expand')
        # expand proxy replicas from 5 to 2
        mic.upgrade(release_name, {'spec.components.proxy.replicas': 2},
                    constants.NAMESPACE)
        mic.wait_for_healthy(release_name, constants.NAMESPACE)
        wait_pods_ready(constants.NAMESPACE,
                        f"app.kubernetes.io/instance={release_name}")
        e2e_milvus_parallel(2, host, c_name)
        log.info('Milvus test after shrink')
        connections.connect('default', host=host, port=19530)
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(name=c_name)
        """ total start 2+5+2 process to run e2e, each time insert default_nb data, But one of the 2 processes started for the first time did not insert due to collection creation exception. So actually insert eight times """
        # NOTE(review): the 8x expectation assumes exactly one of the nine
        # e2e runs fails on collection creation — confirm this is stable.
        assert collection_w.num_entities == 8 * default_nb
    except Exception as e:
        log.error(str(e))
        fail_count += 1
        # raise Exception(str(e))
    finally:
        log.info(f'Test finished with {fail_count} fail request')
        assert fail_count <= 1
        label = f"app.kubernetes.io/instance={release_name}"
        log.info('Start to export milvus pod logs')
        read_pod_log(namespace=constants.NAMESPACE, label_selector=label,
                     release_name=release_name)
        mic.uninstall(release_name, namespace=constants.NAMESPACE)
def dict_equal_check(dict1, dict2):
    """
    Return True when both arguments are dicts with equal contents.

    :param dict1: first dict to compare
    :param dict2: second dict to compare
    :return: False (with an error log) when either argument is not a dict,
             otherwise the result of the equality comparison
    """
    if not isinstance(dict1, dict) or not isinstance(dict2, dict):
        log.error("[DICT_EQUAL_CHECK] Type of dict(%s) or dict(%s) is not a dict."
                  % (str(dict1), str(dict2)))
        return False
    # plain == is the idiomatic equivalent of the former operator.eq call
    return dict1 == dict2
def test_chaos(self, chaos_yaml):
    """
    Run one chaos experiment described by *chaos_yaml*: assert all ops are
    healthy before injection, inject the chaos object, re-assert against the
    per-op expectations from testcases.yaml, delete the chaos object, then
    assert all ops recover.
    """
    # start the monitor threads to check the milvus ops
    log.debug("*********************Chaos Test Start**********************")
    log.debug(connections.get_connection_addr('default'))
    self.checker_threads = start_monitor_threads(self.health_checkers)
    # parse chaos object
    chaos_config = gen_experiment_config(chaos_yaml)
    self._chaos_config = chaos_config  # cache the chaos config for tear down
    log.debug(chaos_config)
    # parse the test expectations in testcases.yaml
    if self.parser_testcase_config(chaos_yaml) is False:
        log.error("Fail to get the testcase info in testcases.yaml")
        assert False
    # wait 120s
    sleep(constants.WAIT_PER_OP*2)
    # assert statistic:all ops 100% succ
    log.debug("******1st assert before chaos: ")
    assert_statistic(self.health_checkers)
    # apply chaos object
    chaos_opt = ChaosOpt(chaos_config['kind'])
    chaos_opt.create_chaos_object(chaos_config)
    log.debug("chaos injected")
    sleep(constants.WAIT_PER_OP * 2.1)
    # reset counting so the 2nd assertion only measures the chaos window
    reset_counting(self.health_checkers)
    # wait 120s
    sleep(constants.WAIT_PER_OP*4)
    for k, t in self.checker_threads.items():
        log.debug(f"10s later: Thread {k} is_alive(): {t.is_alive()}")
    # assert statistic against the expectations parsed from testcases.yaml
    log.debug("******2nd assert after chaos injected: ")
    assert_statistic(self.health_checkers,
                     expectations={Op.create: self.expect_create,
                                   Op.insert: self.expect_insert,
                                   Op.flush: self.expect_flush,
                                   Op.index: self.expect_index,
                                   Op.search: self.expect_search,
                                   Op.query: self.expect_query
                                   })
    # delete chaos
    meta_name = chaos_config.get('metadata', None).get('name', None)
    chaos_opt.delete_chaos_object(meta_name)
    log.debug("chaos deleted")
    for k, t in self.checker_threads.items():
        log.debug(f"Thread {k} is_alive(): {t.is_alive()}")
    sleep(2)
    # reconnect if needed
    sleep(constants.WAIT_PER_OP*2)
    reconnect(connections, self.host, self.port)
    # reset counting again so the 3rd assertion only measures recovery
    reset_counting(self.health_checkers)
    # wait 300s (varies by feature)
    sleep(constants.WAIT_PER_OP*5)
    # assert statistic: all ops success again
    log.debug("******3rd assert after chaos deleted: ")
    assert_statistic(self.health_checkers)
    # assert all expectations
    assert_expectations()
    log.debug("*********************Chaos Test Completed**********************")
def test_scale_proxy(self):
    """
    target: test milvus operation after proxy expand
    method: 1.deploy 1 proxy replicas
            2.milvus e2e test in parallel
            3.expand proxy pod from 1 to 5
            4.milvus e2e test
            5.shrink proxy from 5 to 2
    expected: 1.verify data consistent and func work
    """
    # deploy milvus cluster with one proxy
    fail_count = 0
    release_name = "scale-proxy"
    image_tag = get_latest_tag()
    image = f'{constants.IMAGE_REPOSITORY}:{image_tag}'
    data_config = {
        'metadata.namespace': constants.NAMESPACE,
        'metadata.name': release_name,
        'spec.components.image': image,
        'spec.components.proxy.serviceType': 'LoadBalancer',
        'spec.components.proxy.replicas': 1,
        'spec.components.dataNode.replicas': 2,
        'spec.config.dataCoord.enableCompaction': True,
        'spec.config.dataCoord.enableGarbageCollection': True
    }
    mic = MilvusOperator()
    mic.install(data_config)
    if mic.wait_for_healthy(release_name, constants.NAMESPACE, timeout=1200):
        host = mic.endpoint(release_name, constants.NAMESPACE).split(':')[0]
    else:
        # log.warning(f'Deploy {release_name} timeout and ready to uninstall')
        # mic.uninstall(release_name, namespace=constants.NAMESPACE)
        raise BaseException(f'Milvus healthy timeout 1200s')
    try:
        c_name = cf.gen_unique_str(prefix)
        # NOTE(review): 5 parallel e2e runs before expanding from 1 proxy,
        # while the docstring describes a smaller pre-expand test — confirm
        # which is intended.
        self.e2e_milvus_parallel(5, host, c_name)
        log.info('Milvus test before expand')
        # expand proxy replicas from 1 to 5
        mic.upgrade(release_name, {'spec.components.proxy.replicas': 5},
                    constants.NAMESPACE)
        mic.wait_for_healthy(release_name, constants.NAMESPACE)
        wait_pods_ready(constants.NAMESPACE,
                        f"app.kubernetes.io/instance={release_name}")
        self.e2e_milvus_parallel(5, host, c_name)
        log.info('Milvus test after expand')
        # expand proxy replicas from 5 to 2
        mic.upgrade(release_name, {'spec.components.proxy.replicas': 2},
                    constants.NAMESPACE)
        mic.wait_for_healthy(release_name, constants.NAMESPACE)
        wait_pods_ready(constants.NAMESPACE,
                        f"app.kubernetes.io/instance={release_name}")
        self.e2e_milvus_parallel(2, host, c_name)
        log.info('Milvus test after shrink')
    except Exception as e:
        log.error(str(e))
        fail_count += 1
        # raise Exception(str(e))
    finally:
        log.info(f'Test finished with {fail_count} fail request')
        assert fail_count <= 1
        label = f"app.kubernetes.io/instance={release_name}"
        log.info('Start to export milvus pod logs')
        read_pod_log(namespace=constants.NAMESPACE, label_selector=label,
                     release_name=release_name)
        mic.uninstall(release_name, namespace=constants.NAMESPACE)
def test_task_all(self, index_type, is_compacted, segment_status, is_vector_indexed, is_string_indexed, replica_number, is_deleted, data_size):
    """
    before reinstall: create collection and insert data, load and search

    Builds a collection whose name encodes all parameter values, then drives
    it through growing/sealed segment states, optional deletes, indexing and
    compaction, verifying search and query at each stage.
    """
    # Encode every parameter value into the collection name so the
    # post-reinstall counterpart can find the same collection.
    name = ""
    for k, v in locals().items():
        if k in ["self", "name"]:
            continue
        name += f"_{k}_{v}"
    name = prefix + name
    self._connect()
    ms = MilvusSys()
    if len(ms.query_nodes) < replica_number:
        # this step is to make sure this testcase can run on standalone mode
        # or cluster mode which has only one querynode
        pytest.skip("skip test, not enough nodes")
    log.info(f"collection name: {name}, replica_number: {replica_number}, is_compacted: {is_compacted},"
             f"is_deleted: {is_deleted}, is_vector_indexed: {is_vector_indexed}, is_string_indexed: {is_string_indexed},"
             f"segment_status: {segment_status}, index_type: {index_type}")
    is_binary = True if "BIN" in index_type else False
    # params for search and query
    if is_binary:
        _, vectors_to_search = cf.gen_binary_vectors(
            default_nb, default_dim)
        default_search_field = ct.default_binary_vec_field_name
    else:
        vectors_to_search = cf.gen_vectors(default_nb, default_dim)
        default_search_field = ct.default_float_vec_field_name
    search_params = gen_search_param(index_type)[0]
    # init collection and insert with small size data without flush to get growing segment
    collection_w = self.init_collection_general(insert_data=True,
                                                is_binary=is_binary,
                                                nb=3000,
                                                is_flush=False,
                                                is_index=True,
                                                name=name)[0]
    # load for growing segment
    if replica_number >= 1:
        try:
            collection_w.release()
        except Exception as e:
            # release fails when the collection was never loaded; best-effort
            log.error(
                f"release collection failed: {e} maybe the collection is not loaded")
        collection_w.load(replica_number=replica_number)
    # delete data for growing segment
    delete_expr = f"{ct.default_int64_field_name} in [0,1,2,3,4,5,6,7,8,9]"
    if is_deleted == "is_deleted":
        collection_w.delete(expr=delete_expr)
    # search and query for growing segment
    if replica_number >= 1:
        collection_w.search(vectors_to_search[:default_nq],
                            default_search_field,
                            search_params, default_limit,
                            default_search_exp,
                            check_task=CheckTasks.check_search_results,
                            check_items={"nq": default_nq,
                                         "limit": default_limit})
        output_fields = [ct.default_int64_field_name]
        collection_w.query(default_term_expr, output_fields=output_fields,
                           check_task=CheckTasks.check_query_not_empty)
    # skip subsequent operations when segment_status is set to only_growing
    if segment_status == "only_growing":
        pytest.skip(
            "already get growing segment, skip subsequent operations")
    # insert with flush multiple times to generate multiple sealed segment
    for i in range(2):
        self.init_collection_general(insert_data=True,
                                     is_binary=is_binary,
                                     nb=data_size,
                                     is_flush=False,
                                     is_index=True,
                                     name=name)
        collection_w.flush()
    # params for creating index
    if is_binary:
        default_index_field = ct.default_binary_vec_field_name
    else:
        default_index_field = ct.default_float_vec_field_name
    # create index for vector
    if is_vector_indexed == "is_vector_indexed":
        default_index_param = gen_index_param(index_type)
        collection_w.create_index(default_index_field, default_index_param)
    # create index for string
    if is_string_indexed == "is_string_indexed":
        default_string_index_params = {}
        default_string_index_name = "_default_string_idx"
        collection_w.create_index(
            default_string_field_name, default_string_index_params,
            index_name=default_string_index_name)
    # delete data for sealed segment
    delete_expr = f"{ct.default_int64_field_name} in [10,11,12,13,14,15,16,17,18,19]"
    if is_deleted == "is_deleted":
        collection_w.delete(expr=delete_expr)
    if is_compacted == "is_compacted":
        collection_w.compact()
    if segment_status == "all":
        self.init_collection_general(insert_data=True,
                                     is_binary=is_binary,
                                     nb=3000,
                                     is_flush=False,
                                     is_index=True,
                                     name=name)
    # reload after flush and creating index
    if replica_number > 0:
        collection_w.release()
        collection_w.load(replica_number=replica_number)
    # insert data to get growing segment
    if segment_status == "all":
        self.init_collection_general(insert_data=True,
                                     is_binary=is_binary,
                                     nb=3000,
                                     is_flush=False,
                                     is_index=True,
                                     name=name)
    # search and query for sealed and growing segment
    if replica_number > 0:
        collection_w.search(vectors_to_search[:default_nq],
                            default_search_field,
                            search_params, default_limit,
                            default_search_exp,
                            check_task=CheckTasks.check_search_results,
                            check_items={"nq": default_nq,
                                         "limit": default_limit})
        output_fields = [ct.default_int64_field_name]
        collection_w.query(default_term_expr, output_fields=output_fields,
                           check_task=CheckTasks.check_query_not_empty)
def inner_wrapper(*args, **kwargs):
    # Invoke the wrapped API call (`func` from the enclosing decorator scope);
    # return (result, True) on success, or (exception, False) on failure so
    # callers can branch on the status flag without their own try/except.
    try:
        return func(*args, **kwargs), True
    except Exception as e:
        log.error("[ClientRequest API Exception]%s: %s" % (str(func), str(e)))
        return e, False
def test_chaos(self, chaos_yaml):
    """
    Run one chaos experiment described by *chaos_yaml*: assert all ops are
    healthy before injection, inject the chaos custom resource, re-assert
    against the per-op expectations from testcases.yaml, delete the chaos
    object, then assert all ops recover.  Checker states are appended to
    ./reports/<meta_name>.log after each assertion phase.
    """
    # start the monitor threads to check the milvus ops
    log.info("*********************Chaos Test Start**********************")
    log.info(connections.get_connection_addr('default'))
    self.checker_threads = cc.start_monitor_threads(self.health_checkers)
    # parse chaos object
    chaos_config = cc.gen_experiment_config(chaos_yaml)
    self._chaos_config = chaos_config  # cache the chaos config for tear down
    log.info(f"chaos_config: {chaos_config}")
    # parse the test expectations in testcases.yaml
    if self.parser_testcase_config(chaos_yaml) is False:
        log.error("Fail to get the testcase info in testcases.yaml")
        assert False
    # init report
    meta_name = chaos_config.get('metadata', None).get('name', None)
    dir_name = "./reports"
    file_name = f"./reports/{meta_name}.log"
    if not os.path.exists(dir_name):
        os.makedirs(dir_name)
    # wait 20s
    sleep(constants.WAIT_PER_OP * 2)
    # assert statistic:all ops 100% succ
    log.info("******1st assert before chaos: ")
    assert_statistic(self.health_checkers)
    with open(file_name, "a+") as f:
        f.write("1st assert before chaos: ")
        f.write(f"{self.health_checkers}\n")
    # apply chaos object as a custom resource
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.info("chaos injected")
    log.info(f"chaos information: {chaos_res.get(meta_name)}")
    sleep(constants.WAIT_PER_OP * 2.1)
    # reset counting so the 2nd assertion only measures the chaos window
    cc.reset_counting(self.health_checkers)
    # wait 40s
    sleep(constants.CHAOS_DURATION)
    for k, t in self.checker_threads.items():
        log.info(f"10s later: Thread {k} is_alive(): {t.is_alive()}")
    # assert statistic against the expectations parsed from testcases.yaml
    log.info("******2nd assert after chaos injected: ")
    assert_statistic(self.health_checkers,
                     expectations={
                         Op.create: self.expect_create,
                         Op.insert: self.expect_insert,
                         Op.flush: self.expect_flush,
                         Op.index: self.expect_index,
                         Op.search: self.expect_search,
                         Op.query: self.expect_query
                     })
    with open(file_name, "a+") as f:
        f.write("2nd assert after chaos injected:")
        f.write(f"{self.health_checkers}\n")
    # delete chaos
    chaos_res.delete(meta_name)
    log.info("chaos deleted")
    for k, t in self.checker_threads.items():
        log.info(f"Thread {k} is_alive(): {t.is_alive()}")
    sleep(2)
    # reconnect if needed
    sleep(constants.WAIT_PER_OP * 2)
    cc.reconnect(connections, alias='default')
    # reset counting again so the 3rd assertion only measures recovery
    cc.reset_counting(self.health_checkers)
    # wait 50s (varies by feature)
    sleep(constants.WAIT_PER_OP * 5)
    # assert statistic: all ops success again
    log.info("******3rd assert after chaos deleted: ")
    assert_statistic(self.health_checkers)
    with open(file_name, "a+") as f:
        f.write("3rd assert after chaos deleted:")
        f.write(f"{self.health_checkers}\n")
    # assert all expectations
    assert_expectations()
    log.info(
        "*********************Chaos Test Completed**********************")
def test_chaos(self, chaos_yaml):
    """
    Run one chaos experiment described by *chaos_yaml* against the Milvus
    release named by the chaos object's metadata, verifying per-operation
    success rates before, during, and after the fault injection.

    :param chaos_yaml: path to the chaos experiment definition file
    :type chaos_yaml: str

    Flow: start monitor threads -> rewrite the chaos object so it targets
    this release -> load expectations -> assert all ops healthy -> apply
    the chaos custom resource -> assert ops match expectations -> delete
    the chaos resource -> wait for pods ready -> reconnect -> assert all
    ops healthy again. Each phase's results are appended (via
    record_results) to ./reports/<meta_name>.log.
    """
    # start the monitor threads to check the milvus ops
    log.info("*********************Chaos Test Start**********************")
    log.info(connections.get_connection_addr('default'))
    cc.start_monitor_threads(self.health_checkers)

    # parse chaos object
    chaos_config = cc.gen_experiment_config(chaos_yaml)
    # use an empty-dict fallback so a chaos object without 'metadata'
    # does not raise AttributeError on the chained lookup
    meta_name = chaos_config.get('metadata', {}).get('name')
    release_name = meta_name
    # retarget the chaos object from the template's default release name
    # ("milvus-chaos") to the release under test via a JSON round-trip,
    # which rewrites the name wherever it appears in the nested config
    chaos_config_str = json.dumps(chaos_config)
    chaos_config_str = chaos_config_str.replace("milvus-chaos", release_name)
    chaos_config = json.loads(chaos_config_str)
    self._chaos_config = chaos_config  # cache the chaos config for tear down
    log.info(f"chaos_config: {chaos_config}")

    # parse the test expectations in testcases.yaml
    if self.parser_testcase_config(chaos_yaml, chaos_config) is False:
        log.error("Fail to get the testcase info in testcases.yaml")
        assert False

    # init report
    dir_name = "./reports"
    file_name = f"./reports/{meta_name}.log"
    # exist_ok avoids the check-then-create race of the exists()/makedirs pair
    os.makedirs(dir_name, exist_ok=True)

    # wait 20s
    sleep(constants.WAIT_PER_OP * 2)

    # assert statistic:all ops 100% succ
    log.info("******1st assert before chaos: ")
    assert_statistic(self.health_checkers)
    with open(file_name, "a+") as f:
        ts = time.strftime("%Y-%m-%d %H:%M:%S")
        f.write(f"{meta_name}-{ts}\n")
        f.write("1st assert before chaos:\n")
        f.write(record_results(self.health_checkers))

    # apply chaos object
    chaos_res = CusResource(kind=chaos_config['kind'],
                            group=constants.CHAOS_GROUP,
                            version=constants.CHAOS_VERSION,
                            namespace=constants.CHAOS_NAMESPACE)
    chaos_res.create(chaos_config)
    log.info("chaos injected")
    log.info(f"chaos information: {chaos_res.get(meta_name)}")
    sleep(constants.WAIT_PER_OP * 2)

    # reset counting
    cc.reset_counting(self.health_checkers)

    # wait 40s
    sleep(constants.CHAOS_DURATION)
    log.info(f'Alive threads: {threading.enumerate()}')

    # assert statistic
    log.info("******2nd assert after chaos injected: ")
    assert_statistic(self.health_checkers,
                     expectations={Op.create: self.expect_create,
                                   Op.insert: self.expect_insert,
                                   Op.flush: self.expect_flush,
                                   Op.index: self.expect_index,
                                   Op.search: self.expect_search,
                                   Op.query: self.expect_query})
    with open(file_name, "a+") as f:
        f.write("2nd assert after chaos injected:\n")
        f.write(record_results(self.health_checkers))

    # delete chaos
    chaos_res.delete(meta_name)
    log.info("chaos deleted")
    log.info(f'Alive threads: {threading.enumerate()}')
    sleep(2)

    # wait all pods ready; some deployments label pods with
    # app.kubernetes.io/instance, others with release, so wait on both
    log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label app.kubernetes.io/instance={meta_name}")
    wait_pods_ready(constants.CHAOS_NAMESPACE, f"app.kubernetes.io/instance={meta_name}")
    log.info(f"wait for pods in namespace {constants.CHAOS_NAMESPACE} with label release={meta_name}")
    wait_pods_ready(constants.CHAOS_NAMESPACE, f"release={meta_name}")
    log.info("all pods are ready")

    # reconnect if needed
    sleep(constants.WAIT_PER_OP * 2)
    cc.reconnect(connections, alias='default')

    # reset counting again
    cc.reset_counting(self.health_checkers)

    # wait 50s (varies by feature)
    sleep(constants.WAIT_PER_OP * 5)

    # assert statistic: all ops success again
    log.info("******3rd assert after chaos deleted: ")
    assert_statistic(self.health_checkers)
    with open(file_name, "a+") as f:
        f.write("3rd assert after chaos deleted:\n")
        f.write(record_results(self.health_checkers))

    # assert all expectations
    assert_expectations()
    log.info("*********************Chaos Test Completed**********************")
def test_scale_data_node(self):
    """
    target: test scale dataNode
    method: 1.deploy milvus cluster with 2 dataNode
            2.create collection with shards_num=4
            3.continuously insert new data (daemon thread)
            4.expand dataNode from 2 to 5
            5.create new collection with shards_num=3
            6.continuously insert new collection new data (daemon thread)
            7.shrink dataNode from 5 to 3
    expected: Verify milvus remains healthy, Insert and flush successfully
              during scale
              Average dataNode memory usage
    """
    release_name = "scale-data"
    image_tag = get_latest_tag()
    image = f'{constants.IMAGE_REPOSITORY}:{image_tag}'
    data_config = {
        'metadata.namespace': constants.NAMESPACE,
        'spec.mode': 'cluster',
        'metadata.name': release_name,
        'spec.components.image': image,
        'spec.components.proxy.serviceType': 'LoadBalancer',
        'spec.components.dataNode.replicas': 2,
        'spec.config.common.retentionDuration': 60
    }
    mic = MilvusOperator()
    mic.install(data_config)
    if mic.wait_for_healthy(release_name, constants.NAMESPACE, timeout=1800):
        host = mic.endpoint(release_name, constants.NAMESPACE).split(':')[0]
    else:
        # no placeholders needed, so a plain string (not an f-string)
        raise MilvusException(message='Milvus healthy timeout 1800s')
    try:
        # connect
        connections.add_connection(default={"host": host, "port": 19530})
        connections.connect(alias='default')

        # create
        c_name = cf.gen_unique_str("scale_data")
        collection_w = ApiCollectionWrapper()
        collection_w.init_collection(name=c_name,
                                     schema=cf.gen_default_collection_schema(),
                                     shards_num=4)
        tmp_nb = 10000

        @counter
        def do_insert():
            """ do insert and flush """
            insert_res, is_succ = collection_w.insert(
                cf.gen_default_dataframe_data(tmp_nb))
            log.debug(collection_w.num_entities)
            return insert_res, is_succ

        def loop_insert():
            """ loop do insert """
            while True:
                do_insert()

        # daemon thread keeps inserting while the cluster is scaled
        threading.Thread(target=loop_insert, args=(), daemon=True).start()

        # scale dataNode to 5
        mic.upgrade(release_name,
                    {'spec.components.dataNode.replicas': 5},
                    constants.NAMESPACE)
        mic.wait_for_healthy(release_name, constants.NAMESPACE)
        wait_pods_ready(constants.NAMESPACE,
                        f"app.kubernetes.io/instance={release_name}")
        log.debug("Expand dataNode test finished")

        # create new collection and insert
        new_c_name = cf.gen_unique_str("scale_data")
        collection_w_new = ApiCollectionWrapper()
        collection_w_new.init_collection(
            name=new_c_name,
            schema=cf.gen_default_collection_schema(),
            shards_num=3)

        @counter
        def do_new_insert():
            """ do new insert """
            insert_res, is_succ = collection_w_new.insert(
                cf.gen_default_dataframe_data(tmp_nb))
            log.debug(collection_w_new.num_entities)
            return insert_res, is_succ

        def loop_new_insert():
            """ loop new insert """
            while True:
                do_new_insert()

        threading.Thread(target=loop_new_insert, args=(), daemon=True).start()

        # scale dataNode to 3
        mic.upgrade(release_name,
                    {'spec.components.dataNode.replicas': 3},
                    constants.NAMESPACE)
        mic.wait_for_healthy(release_name, constants.NAMESPACE)
        wait_pods_ready(constants.NAMESPACE,
                        f"app.kubernetes.io/instance={release_name}")
        log.debug(collection_w.num_entities)
        time.sleep(300)
        scale_common.check_succ_rate(do_insert)
        scale_common.check_succ_rate(do_new_insert)
        log.debug("Shrink dataNode test finished")
    except Exception as e:
        log.error(str(e))
        # re-raise so the test actually fails on error instead of silently
        # passing; the finally block below still exports logs and uninstalls
        raise
    finally:
        label = f"app.kubernetes.io/instance={release_name}"
        log.info('Start to export milvus pod logs')
        read_pod_log(namespace=constants.NAMESPACE,
                     label_selector=label,
                     release_name=release_name)
        mic.uninstall(release_name, namespace=constants.NAMESPACE)