def alias_cases():
    """Exercise the alias lifecycle: create, alter, and drop a collection alias.

    Relies on this module's setup()/teardown() helpers; setup() is expected to
    return two collections A and B with distinct descriptions.
    """
    teardown()
    A, B = setup()
    latest_coll_alias = "latest_collection"

    # Point the alias at A and verify it resolves to A.
    utility.create_alias(A.name, latest_coll_alias)
    alias_collection = Collection(latest_coll_alias)
    assert alias_collection.description == A.description

    # Re-point the alias at B and verify it now resolves to B.
    utility.alter_alias(B.name, latest_coll_alias)
    alias_collection = Collection(latest_coll_alias)
    assert alias_collection.description == B.description

    # After dropping the alias, resolving it must fail.
    utility.drop_alias(latest_coll_alias)
    try:
        alias_collection = Collection(latest_coll_alias)
    except Exception as e:
        # Was `except BaseException`: too broad — it would also swallow
        # KeyboardInterrupt/SystemExit. Exception is sufficient here.
        print(
            f" - Alias [{latest_coll_alias}] dropped, cannot get collection from it. Error msg: {e}"
        )
    finally:
        teardown()
def test_collection_by_DataFrame(self):
    """Create, re-open, and drop a Collection against a fully mocked GrpcHandler.

    Despite the name, no DataFrame is used: the collection is built from an
    explicit CollectionSchema. Every gRPC call is patched out, so this only
    verifies the ORM-layer wiring.
    """
    from pymilvus import Collection
    from pymilvus import FieldSchema, CollectionSchema
    from pymilvus import DataType
    coll_name = gen_collection_name()
    fields = [
        FieldSchema("int64", DataType.INT64),
        FieldSchema("float", DataType.FLOAT),
        FieldSchema("float_vector", DataType.FLOAT_VECTOR, dim=128)
    ]
    # All patches target the low-level gRPC handler, not the ORM layer.
    prefix = "pymilvus.client.grpc_handler.GrpcHandler"
    collection_schema = CollectionSchema(fields, primary_field="int64")
    with mock.patch(f"{prefix}.__init__", return_value=None):
        with mock.patch(f"{prefix}._wait_for_channel_ready", return_value=None):
            connections.connect()

            # has_collection=False -> constructor takes the create-new path.
            with mock.patch(f"{prefix}.create_collection", return_value=None):
                with mock.patch(f"{prefix}.has_collection", return_value=False):
                    collection = Collection(name=coll_name, schema=collection_schema)

            # has_collection=True -> schema is re-derived via describe_collection.
            with mock.patch(f"{prefix}.create_collection", return_value=None):
                with mock.patch(f"{prefix}.has_collection", return_value=True):
                    with mock.patch(f"{prefix}.describe_collection", return_value=collection_schema.to_dict()):
                        collection = Collection(name=coll_name)

            with mock.patch(f"{prefix}.drop_collection", return_value=None):
                with mock.patch(f"{prefix}.describe_index", return_value=None):
                    collection.drop()
def test_collection_by_DataFrame(self):
    """Create, re-open, and drop a Collection against a fully mocked Milvus client.

    Despite the name, no DataFrame is involved: the collection is built from an
    explicit CollectionSchema, with every client call patched out.
    """
    from pymilvus import Collection, connections
    from pymilvus import FieldSchema, CollectionSchema
    from pymilvus import DataType
    coll_name = gen_collection_name()
    fields = [
        FieldSchema("int64", DataType.INT64),
        FieldSchema("float", DataType.FLOAT),
        FieldSchema("float_vector", DataType.FLOAT_VECTOR, dim=128)
    ]
    collection_schema = CollectionSchema(fields, primary_field="int64")
    with mock.patch("pymilvus.Milvus.__init__", return_value=None):
        connections.connect()

        # has_collection=False -> constructor takes the create-new path.
        with mock.patch("pymilvus.Milvus.create_collection", return_value=None):
            with mock.patch("pymilvus.Milvus.has_collection", return_value=False):
                collection = Collection(name=coll_name, schema=collection_schema)

        # has_collection=True -> schema is re-derived via describe_collection.
        with mock.patch("pymilvus.Milvus.create_collection", return_value=None):
            with mock.patch("pymilvus.Milvus.has_collection", return_value=True):
                with mock.patch("pymilvus.Milvus.describe_collection", return_value=collection_schema.to_dict()):
                    collection = Collection(name=coll_name)

        with mock.patch("pymilvus.Milvus.drop_collection", return_value=None):
            with mock.patch("pymilvus.Milvus.describe_index", return_value=None):
                collection.drop()
def test_collection_only_name():
    """A collection created once can be re-opened by name alone and used."""
    coll_name = gen_unique_str()
    # First construction registers the schema server-side.
    Collection(name=coll_name, schema=gen_default_fields())
    # Second construction resolves the existing collection by name only.
    reopened = Collection(name=coll_name)
    reopened.insert(gen_float_data(default_nb))
    reopened.load()
    assert reopened.is_empty is False
    assert reopened.num_entities == default_nb
    reopened.drop()
def test_specify_primary_key():
    """Indexes can be built regardless of how the primary key is specified.

    Runs an identical create-index flow against two schemas that differ only
    in the primary-key declaration; the original duplicated this flow inline.
    """
    data = gen_float_data(default_nb)
    for schema in (gen_default_fields_with_primary_key_1(),
                   gen_default_fields_with_primary_key_2()):
        _create_index_all_params(data, schema)


def _create_index_all_params(data, schema):
    """Create a collection from *schema*, build every simple index on the
    default float-vector field, assert at least one index exists, then drop."""
    collection = Collection(name=gen_unique_str(), data=data, schema=schema)
    for index_param in gen_simple_index():
        collection.create_index(field_name=default_float_vec_field_name,
                                index_params=index_param)
    assert len(collection.indexes) != 0
    collection.drop()
def test_partition():
    """Partition lifecycle: create, insert, load, search, release, drop."""
    connections.connect(alias="default")
    print("create collection")
    collection = Collection(name=gen_unique_str(), schema=gen_default_fields())
    print("create partition")
    partition = Partition(collection, name=gen_unique_str())
    print(list_collections())
    assert has_partition(collection.name, partition.name) is True
    data = gen_data(default_nb)
    print("insert data to partition")
    partition.insert(data)
    assert partition.is_empty is False
    assert partition.num_entities == default_nb
    print("load partition")
    partition.load()
    topK = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    print("search partition")
    # data[2] is presumably the float_vector column (it is searched against
    # the "float_vector" field) — verify against gen_data's layout.
    res = partition.search(data[2][-2:], "float_vector", search_params, topK,
                           "count > 100")
    for hits in res:
        for hit in hits:
            print(hit)
    print("release partition")
    partition.release()
    print("drop partition")
    partition.drop()
    print("drop collection")
    collection.drop()
def create_collection(name, id_field, vector_field, attr1_name, attr2_name):
    """Create a collection with an int64 PK, a float vector, and two scalar attrs.

    Returns the newly created Collection.
    """
    field1 = FieldSchema(name=id_field, dtype=DataType.INT64,
                         description="int64", is_primary=True)
    field2 = FieldSchema(name=vector_field, dtype=DataType.FLOAT_VECTOR,
                         description="float vector", dim=_DIM, is_primary=False)
    # Scalar fields take no `dim`; the original passed dim=_DIM here by
    # mistake and carried its own "TODO: remove dim" — done.
    field3 = FieldSchema(name=attr1_name, dtype=DataType.INT64,
                         description="attr1", is_primary=False)
    field4 = FieldSchema(name=attr2_name, dtype=DataType.DOUBLE,
                         description="attr2", is_primary=False)
    schema = CollectionSchema(fields=[field1, field2, field3, field4],
                              description="collection description")
    collection = Collection(name=name, data=None, schema=schema)
    print("\ncollection created:", name)
    return collection
def test_create_index_binary_vector():
    """Building the default binary index on a binary-vector field succeeds."""
    coll = Collection(name=gen_unique_str(), schema=gen_binary_schema())
    coll.insert(gen_binary_data(default_nb))
    coll.create_index(field_name=default_binary_vec_field_name,
                      index_params=default_binary_index)
    assert coll.indexes
    coll.drop()
def test_create_index_float_vector():
    """Every simple index type can be built on the default float-vector field."""
    rows = gen_float_data(default_nb)
    coll = Collection(name=gen_unique_str(), data=rows, schema=gen_default_fields())
    for params in gen_simple_index():
        coll.create_index(field_name=default_float_vec_field_name,
                          index_params=params)
    assert coll.indexes
    coll.drop()
def create_collections_and_insert_data():
    """Create one collection per entry in all_index_types and load 3000 rows
    into each, printing the entity count (and how long fetching it took)."""
    import random
    import time

    dim = 128
    default_fields = [
        FieldSchema(name="count", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="random_value", dtype=DataType.DOUBLE),
        FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim),
    ]
    default_schema = CollectionSchema(fields=default_fields,
                                      description="test collection")
    print(f"\nList collections...")
    print(list_collections())

    nb = 3000
    for col_name in all_index_types:
        print(f"\nCreate collection...")
        collection = Collection(name=col_name, schema=default_schema)
        # Column-ordered insert: [primary keys, random doubles, vectors].
        pks = list(range(nb))
        randoms = [float(random.randrange(-20, -10)) for _ in range(nb)]
        vectors = [[i / nb for _ in range(dim)] for i in range(nb)]
        collection.insert([pks, randoms, vectors])
        print(f"collection name: {col_name}")
        print("Get collection entities")
        start_time = time.time()
        print(f"collection entities: {collection.num_entities}")
        end_time = time.time()
        print("Get collection entities time = %.4fs" % (end_time - start_time))

    print(f"\nList collections...")
    print(list_collections())
def test_collection_with_dataframe():
    """construct_from_dataframe both creates the collection and inserts rows."""
    frame = gen_dataframe(default_nb)
    coll, _ = Collection.construct_from_dataframe(
        name=gen_unique_str(), dataframe=frame, primary_field="int64")
    coll.load()
    assert coll.is_empty is False
    assert coll.num_entities == default_nb
    coll.drop()
def get_collections():
    """Print every collection name together with its entity count."""
    print(f"\nList collections...")
    names = list_collections()
    print(f"collections_nums: {len(names)}")
    # Report each collection's entity count.
    for coll_name in names:
        print(f"{coll_name}: {Collection(name=coll_name).num_entities}")
def get_collections(prefix):
    """Print entity counts for all collections whose name starts with *prefix*.

    Returns the list of matching collection names.
    """
    print(f"\nList collections...")
    matched = filter_collections_by_prefix(prefix)
    print(f"collections_nums: {len(matched)}")
    # Report each collection's entity count.
    for coll_name in matched:
        print(f"{coll_name}: {Collection(name=coll_name).num_entities}")
    return matched
def load_and_search(prefix, replicas=1):
    """Load every collection matching *prefix* (optionally with replicas), then
    run one search and one query against each, printing results and latency.
    """
    print("search data starts")
    col_list = get_collections(prefix)
    for col_name in col_list:
        c = Collection(name=col_name)
        print(f"collection name: {col_name}")
        print("release collection")
        c.release()
        print("load collection")
        t0 = time.time()
        if replicas == 1:
            c.load()
        if replicas > 1:
            c.load(replica_number=replicas)
            print(c.get_replicas())
        print(f"load time: {time.time() - t0:.4f}")
        topK = 5
        vectors = [[1.0 for _ in range(128)] for _ in range(3000)]
        # Collections are named <prefix><index_type>; strip the prefix to
        # pick search params matching the index type.
        index_name = col_name.replace(prefix, "")
        search_params = gen_search_param(index_name)[0]
        print(search_params)
        # search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
        start_time = time.time()
        print(f"\nSearch...")
        # define output_fields of search result
        res = c.search(vectors[:1], "float_vector", search_params, topK,
                       "count > 500", output_fields=["count", "random_value"],
                       timeout=120)
        end_time = time.time()
        # show result
        for hits in res:
            for hit in hits:
                # Get value of the random value field for search result
                print(hit, hit.entity.get("random_value"))
            ids = hits.ids
            print(ids)
        print("search latency: %.4fs" % (end_time - start_time))
        t0 = time.time()
        expr = "count in [2,4,6,8]"
        output_fields = ["count", "random_value"]
        res = c.query(expr, output_fields, timeout=20)
        sorted_res = sorted(res, key=lambda k: k['count'])
        for r in sorted_res:
            print(r)
        t1 = time.time()
        print("query latency: %.4fs" % (t1 - t0))
        # c.release()
        print("###########")
    print("search data ends")
def get_collections(prefix, check=False):
    """Print entity counts for collections whose name starts with *prefix*.

    When *check* is True, assert each collection holds at least 3000 entities.
    Returns the list of matching collection names.
    """
    print("\nList collections...")
    matched = filter_collections_by_prefix(prefix)
    print(f"collections_nums: {len(matched)}")
    # Report each collection's entity count (and optionally validate it).
    for coll_name in matched:
        entity_count = Collection(name=coll_name).num_entities
        print(f"{coll_name}: {entity_count}")
        if check:
            assert entity_count >= 3000
    return matched
def create_collection(name, id_field, vector_field):
    """Create and return a collection with an int64 PK and one float vector."""
    pk = FieldSchema(name=id_field, dtype=DataType.INT64,
                     description="int64", is_primary=True)
    vec = FieldSchema(name=vector_field, dtype=DataType.FLOAT_VECTOR,
                      description="float vector", dim=_DIM, is_primary=False)
    schema = CollectionSchema(fields=[pk, vec],
                              description="collection description")
    created = Collection(name=name, data=None, schema=schema)
    print("\ncollection created:", name)
    return created
def create_collections_and_insert_data(prefix, flush=True, count=3000, collection_cnt=11):
    """Create one collection per index type (name = prefix + index type) and
    insert *count* rows into each, in ten batches of count//10.

    When *flush* is True, reading num_entities after inserting forces a flush
    so the counts become visible immediately.
    """
    import random
    dim = 128
    nb = count // 10  # batch size: ten batches per collection
    default_fields = [
        FieldSchema(name="count", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="random_value", dtype=DataType.DOUBLE),
        FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim)
    ]
    default_schema = CollectionSchema(fields=default_fields, description="test collection")
    for index_name in all_index_types[:collection_cnt]:
        print("\nCreate collection...")
        col_name = prefix + index_name
        collection = Collection(name=col_name, schema=default_schema)
        print(f"collection name: {col_name}")
        print(f"begin insert, count: {count} nb: {nb}")
        times = int(count // nb)
        total_time = 0.0
        vectors = [[random.random() for _ in range(dim)] for _ in range(count)]
        for j in range(times):
            start_time = time.time()
            # Column-ordered batch: [pks, random doubles, vector slice].
            collection.insert(
                [[i for i in range(nb * j, nb * j + nb)],
                 [float(random.randrange(-20, -10)) for _ in range(nb)],
                 vectors[nb * j:nb * j + nb]])
            end_time = time.time()
            print(
                f"[{j+1}/{times}] insert {nb} data, time: {end_time - start_time:.4f}"
            )
            total_time += end_time - start_time
        print(f"end insert, time: {total_time:.4f}")
        if flush:
            # num_entities triggers a flush, so timing it measures flush cost.
            print("Get collection entities")
            start_time = time.time()
            print(f"collection entities: {collection.num_entities}")
            end_time = time.time()
            print("Get collection entities time = %.4fs" % (end_time - start_time))
    print("\nList collections...")
    print(get_collections(prefix))
def create_collection(name, id_field, vector_field, str_field):
    """Create and return a collection with an int64 PK, a float vector, and a
    VARCHAR field.
    """
    field1 = FieldSchema(name=id_field, dtype=DataType.INT64,
                         description="int64", is_primary=True)
    field2 = FieldSchema(name=vector_field, dtype=DataType.FLOAT_VECTOR,
                         description="float vector", dim=_DIM, is_primary=False)
    # NOTE(review): current pymilvus spells the VARCHAR length kwarg
    # `max_length`; confirm `max_len_per_row` is accepted by the client
    # version this script is pinned to.
    field3 = FieldSchema(name=str_field, dtype=DataType.VARCHAR,
                         description="string", max_len_per_row=_MAX_LEN_PER_ROW,
                         is_primary=False)
    schema = CollectionSchema(fields=[field1, field2, field3],
                              description="collection description")
    collection = Collection(name=name, data=None, schema=schema)
    print("\ncollection created:", name)
    return collection
def create_index():
    """Build an index on the float_vector field of every collection.

    Collections are named after index types, so the collection name selects
    both the index type and its parameters from index_params_map.
    """
    base_index = {
        "index_type": "IVF_FLAT",
        "params": {
            "nlist": 128
        },
        "metric_type": "L2"
    }
    names = list_collections()
    print(f"\nCreate index...")
    for name in names:
        coll = Collection(name=name)
        print(name)
        print(coll)
        index = copy.deepcopy(base_index)
        index["index_type"] = name
        index["params"] = index_params_map[name]
        # Binary index types require a binary metric.
        if name in ("BIN_FLAT", "BIN_IVF_FLAT"):
            index["metric_type"] = "HAMMING"
        coll.create_index(field_name="float_vector", index_params=index)
def load_and_search():
    """Load every collection, run one vector search on each, print results and
    latency, then release the collection.
    """
    print("search data starts")
    col_list = list_collections()
    for name in col_list:
        c = Collection(name=name)
        print(f"collection name: {name}")
        c.load()
        topK = 5
        vectors = [[0.0 for _ in range(128)] for _ in range(3000)]
        # Collections are named after index types; reuse the name to pick
        # matching search params.
        index_type = name
        search_params = gen_search_param(index_type)[0]
        print(search_params)
        # search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
        import time
        start_time = time.time()
        print(f"\nSearch...")
        # define output_fields of search result
        res = c.search(vectors[:1], "float_vector", search_params, topK,
                       "count > 500", output_fields=["count", "random_value"],
                       timeout=20)
        end_time = time.time()
        # show result
        for hits in res:
            for hit in hits:
                # Get value of the random value field for search result
                print(hit, hit.entity.get("random_value"))
            ids = hits.ids
            print(ids)
        print("###########")
        print("search latency = %.4fs" % (end_time - start_time))
        c.release()
    print("search data ends")
def create_index(prefix):
    """Build the appropriate float_vector index on every prefixed collection.

    The index type is encoded in the collection name as <prefix><index_type>.
    """
    base_index = {
        "index_type": "IVF_FLAT",
        "params": {
            "nlist": 128
        },
        "metric_type": "L2"
    }
    names = get_collections(prefix)
    print("\nCreate index...")
    for col_name in names:
        coll = Collection(name=col_name)
        # Strip the prefix to recover the index type encoded in the name.
        index_type = col_name.replace(prefix, "")
        print(index_type)
        print(coll)
        index = copy.deepcopy(base_index)
        index["index_type"] = index_type
        index["params"] = index_params_map[index_type]
        if index_type in ("BIN_FLAT", "BIN_IVF_FLAT"):
            # Binary index types require a binary metric.
            index["metric_type"] = "HAMMING"
        started = time.time()
        coll.create_index(field_name="float_vector", index_params=index)
        print(f"create index time: {time.time() - started:.4f}")
def hello_milvus(collection_name):
    """End-to-end walkthrough against *collection_name*: create (or reuse) the
    collection, insert 3000 rows, build an IVF_SQ8 index, load, search, query,
    then release.
    """
    import time
    # create collection
    dim = 128
    default_fields = [
        FieldSchema(name="int64", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="float", dtype=DataType.FLOAT),
        # NOTE(review): current pymilvus spells this kwarg `max_length`;
        # confirm `max_length_per_row` is valid for the pinned client version.
        FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length_per_row=65535),
        FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim)
    ]
    default_schema = CollectionSchema(fields=default_fields, description="test collection")
    if utility.has_collection(collection_name):
        # Reuse the existing schema and recover its vector dimension.
        print("collection is exist")
        collection = Collection(name=collection_name)
        default_schema = collection.schema
        # presumably 101/102 are the DataType codes for the vector types —
        # verify against pymilvus DataType before relying on this.
        dim = [
            field.params['dim'] for field in default_schema.fields
            if field.dtype in [101, 102]
        ][0]
    print(f"\nCreate collection...")
    collection = Collection(name=collection_name, schema=default_schema)
    # insert data
    nb = 3000
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    t0 = time.time()
    collection.insert([[i for i in range(nb)],
                       [np.float32(i) for i in range(nb)],
                       [str(i) for i in range(nb)],
                       vectors])
    t1 = time.time()
    print(f"\nInsert {nb} vectors cost {t1 - t0:.4f} seconds")
    t0 = time.time()
    print(f"\nGet collection entities...")
    print(collection.num_entities)
    t1 = time.time()
    print(f"\nGet collection entities cost {t1 - t0:.4f} seconds")
    # create index and load table
    default_index = {
        "index_type": "IVF_SQ8",
        "metric_type": "L2",
        "params": {
            "nlist": 64
        }
    }
    print(f"\nCreate index...")
    t0 = time.time()
    collection.create_index(field_name="float_vector", index_params=default_index)
    t1 = time.time()
    print(f"\nCreate index cost {t1 - t0:.4f} seconds")
    print(f"\nload collection...")
    t0 = time.time()
    collection.load()
    t1 = time.time()
    print(f"\nload collection cost {t1 - t0:.4f} seconds")
    # load and search
    topK = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    t0 = time.time()
    print(f"\nSearch...")
    # define output_fields of search result
    res = collection.search(vectors[-2:], "float_vector", search_params, topK,
                            "int64 > 100", output_fields=["int64", "float"],
                            timeout=TIMEOUT)
    t1 = time.time()
    print(f"search cost {t1 - t0:.4f} seconds")
    # show result
    for hits in res:
        for hit in hits:
            # Get value of the random value field for search result
            print(hit, hit.entity.get("float"))
    # query
    expr = "int64 in [2,4,6,8]"
    output_fields = ["int64", "float"]
    res = collection.query(expr, output_fields, timeout=TIMEOUT)
    sorted_res = sorted(res, key=lambda k: k['int64'])
    for r in sorted_res:
        print(r)
    collection.release()
print(list_collections()) # Create a collection named 'demo_film_tutorial' print(f"\nCreate collection...") field1 = FieldSchema(name="release_year", dtype=DataType.INT64, description="int64", is_primary=True) field2 = FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, description="float vector", dim=8, is_primary=False) schema = CollectionSchema(fields=[field1, field2], description="collection description") collection = Collection(name='demo_film_tutorial', data=None, schema=schema) # List all collection names print(f"\nList collections...") print(list_collections()) print(f"\nGet collection name, schema and description...") print(collection.name) print(collection.schema) print(collection.description) # List all partition names in demo collection print(f"\nList partitions...") print(collection.partitions) # Create a partition named 'American'
def test_exist_collection(name):
    """A collection reported by has_collection can be opened and dropped."""
    assert utility.has_collection(name) is True
    Collection(name).drop()
def test_create_collection():
    """A freshly created collection is empty; return its name for reuse."""
    coll_name = gen_unique_str()
    created = Collection(name=coll_name, schema=gen_default_fields())
    assert created.is_empty is True
    assert created.num_entities == 0
    return coll_name
def index(self, name, field_name, collection_name, schema, get_simple_index):
    """Return an Index built on a freshly connected collection."""
    connections.connect()
    target = Collection(collection_name, schema=schema)
    return Index(target, field_name, get_simple_index)
def hello_milvus(host="127.0.0.1"):
    """Connect to Milvus at *host* and demo the basic flow: create the
    'hello_milvus' collection, insert 3000 rows, index, load, search, query.
    """
    import time
    # create connection
    connections.connect(host=host, port="19530")
    print(f"\nList collections...")
    print(list_collections())
    # create collection
    dim = 128
    default_fields = [
        FieldSchema(name="count", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="random_value", dtype=DataType.DOUBLE),
        FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim)
    ]
    default_schema = CollectionSchema(fields=default_fields, description="test collection")
    print(f"\nCreate collection...")
    collection = Collection(name="hello_milvus", schema=default_schema)
    print(f"\nList collections...")
    print(list_collections())
    # insert data
    nb = 3000
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    t0 = time.time()
    collection.insert([[i for i in range(nb)],
                       [float(random.randrange(-20, -10)) for _ in range(nb)],
                       vectors])
    t1 = time.time()
    print(f"\nInsert {nb} vectors cost {t1 - t0} seconds")
    t0 = time.time()
    print(f"\nGet collection entities...")
    print(collection.num_entities)
    t1 = time.time()
    print(f"\nGet collection entities cost {t1 - t0} seconds")
    # create index and load table
    default_index = {
        "index_type": "IVF_FLAT",
        "params": {
            "nlist": 128
        },
        "metric_type": "L2"
    }
    print(f"\nCreate index...")
    t0 = time.time()
    collection.create_index(field_name="float_vector", index_params=default_index)
    t1 = time.time()
    print(f"\nCreate index cost {t1 - t0} seconds")
    print(f"\nload collection...")
    t0 = time.time()
    collection.load()
    t1 = time.time()
    print(f"\nload collection cost {t1 - t0} seconds")
    # load and search
    topK = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    start_time = time.time()
    print(f"\nSearch...")
    # define output_fields of search result
    res = collection.search(vectors[-2:], "float_vector", search_params, topK,
                            "count > 100", output_fields=["count", "random_value"])
    end_time = time.time()
    # show result
    for hits in res:
        for hit in hits:
            # Get value of the random value field for search result
            print(hit, hit.entity.get("random_value"))
    print("search latency = %.4fs" % (end_time - start_time))
    # query
    expr = "count in [2,4,6,8]"
    output_fields = ["count", "random_value"]
    res = collection.query(expr, output_fields)
    sorted_res = sorted(res, key=lambda k: k['count'])
    for r in sorted_res:
        print(r)
def hello_milvus():
    """Basic end-to-end demo against the default connection: create
    'hello_milvus', insert 3000 rows, index, load, search, then drop it.
    """
    # create connection
    connections.connect()
    print(f"\nList collections...")
    print(list_collections())
    # create collection
    dim = 128
    default_fields = [
        FieldSchema(name="count", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="random_value", dtype=DataType.DOUBLE),
        FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim)
    ]
    default_schema = CollectionSchema(fields=default_fields, description="test collection")
    print(f"\nCreate collection...")
    collection = Collection(name="hello_milvus", schema=default_schema)
    print(f"\nList collections...")
    print(list_collections())
    # insert data
    nb = 3000
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    collection.insert([[i for i in range(nb)],
                       [float(random.randrange(-20, -10)) for _ in range(nb)],
                       vectors])
    print(f"\nGet collection entities...")
    print(collection.num_entities)
    # create index and load table
    default_index = {
        "index_type": "IVF_FLAT",
        "params": {
            "nlist": 128
        },
        "metric_type": "L2"
    }
    print(f"\nCreate index...")
    collection.create_index(field_name="float_vector", index_params=default_index)
    print(f"\nload collection...")
    collection.load()
    # load and search
    topK = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    import time
    start_time = time.time()
    print(f"\nSearch...")
    # define output_fields of search result
    res = collection.search(vectors[-2:], "float_vector", search_params, topK,
                            "count > 100", output_fields=["count", "random_value"])
    end_time = time.time()
    # show result
    for hits in res:
        for hit in hits:
            # Get value of the random value field for search result
            print(hit, hit.entity.get("random_value"))
    print("search latency = %.4fs" % (end_time - start_time))
    # drop collection
    collection.drop()
def hello_milvus(host="127.0.0.1"):
    """End-to-end demo with a VARCHAR field and replica-aware loading:
    create 'hello_milvus', insert 3000 rows, index, load with the current
    replica count, search, and query.
    """
    import time
    # create connection
    connections.connect(host=host, port="19530")
    print(f"\nList collections...")
    print(list_collections())
    # create collection
    dim = 128
    default_fields = [
        FieldSchema(name="int64", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="float", dtype=DataType.FLOAT),
        FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length=65535),
        FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim)
    ]
    default_schema = CollectionSchema(fields=default_fields, description="test collection")
    print(f"\nCreate collection...")
    collection = Collection(name="hello_milvus", schema=default_schema)
    print(f"\nList collections...")
    print(list_collections())
    # insert data
    nb = 3000
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    t0 = time.time()
    collection.insert([[i for i in range(nb)],
                       [np.float32(i) for i in range(nb)],
                       [str(i) for i in range(nb)],
                       vectors])
    t1 = time.time()
    print(f"\nInsert {nb} vectors cost {t1 - t0:.4f} seconds")
    t0 = time.time()
    print(f"\nGet collection entities...")
    print(collection.num_entities)
    t1 = time.time()
    print(f"\nGet collection entities cost {t1 - t0:.4f} seconds")
    # create index and load table
    default_index = {
        "index_type": "IVF_SQ8",
        "metric_type": "L2",
        "params": {
            "nlist": 64
        }
    }
    print(f"\nCreate index...")
    t0 = time.time()
    collection.create_index(field_name="float_vector", index_params=default_index)
    t1 = time.time()
    print(f"\nCreate index cost {t1 - t0:.4f} seconds")
    print("\nGet replicas number")
    try:
        # Reuse the collection's existing replica count; fall back to 1 when
        # replicas cannot be queried (e.g. collection not loaded yet).
        replicas_info = collection.get_replicas()
        replica_number = len(replicas_info.groups)
        print(f"\nReplicas number is {replica_number}")
    except Exception as e:
        print(str(e))
        replica_number = 1
    print(f"\nload collection...")
    t0 = time.time()
    collection.load(replica_number=replica_number)
    t1 = time.time()
    print(f"\nload collection cost {t1 - t0:.4f} seconds")
    # load and search
    topK = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    t0 = time.time()
    print(f"\nSearch...")
    # define output_fields of search result
    res = collection.search(vectors[-2:], "float_vector", search_params, topK,
                            "int64 > 100", output_fields=["int64", "float"],
                            timeout=TIMEOUT)
    t1 = time.time()
    print(f"search cost {t1 - t0:.4f} seconds")
    # show result
    for hits in res:
        for hit in hits:
            # Get value of the random value field for search result
            print(hit, hit.entity.get("float"))
    # query
    expr = "int64 in [2,4,6,8]"
    output_fields = ["int64", "float"]
    res = collection.query(expr, output_fields, timeout=TIMEOUT)
    sorted_res = sorted(res, key=lambda k: k['int64'])
    for r in sorted_res:
        print(r)
def test_construct_from_dataframe(self):
    """construct_from_dataframe returns a Collection as its first element."""
    result = Collection.construct_from_dataframe(
        gen_collection_name(), gen_pd_data(default_nb), primary_field="int64")
    assert type(result[0]) is Collection