def create_collection(name, id_field, vector_field, attr1_name, attr2_name):
    """Create a collection with a primary key, a float vector and two attributes.

    Args:
        name: Name of the collection to create.
        id_field: Name of the INT64 primary-key field.
        vector_field: Name of the FLOAT_VECTOR field (dimension ``_DIM``).
        attr1_name: Name of the INT64 attribute field.
        attr2_name: Name of the DOUBLE attribute field.

    Returns:
        The ``Collection`` object that was constructed.
    """
    pk_schema = FieldSchema(
        name=id_field,
        dtype=DataType.INT64,
        description="int64",
        is_primary=True,
    )
    vec_schema = FieldSchema(
        name=vector_field,
        dtype=DataType.FLOAT_VECTOR,
        description="float vector",
        dim=_DIM,
        is_primary=False,
    )
    # TODO: remove dim.
    attr1_schema = FieldSchema(
        name=attr1_name,
        dtype=DataType.INT64,
        description="attr1",
        is_primary=False,
        dim=_DIM,
    )
    attr2_schema = FieldSchema(
        name=attr2_name,
        dtype=DataType.DOUBLE,
        description="attr2",
        is_primary=False,
        dim=_DIM,
    )
    schema = CollectionSchema(
        fields=[pk_schema, vec_schema, attr1_schema, attr2_schema],
        description="collection description",
    )
    created = Collection(name=name, data=None, schema=schema)
    print("\ncollection created:", name)
    return created
def test_collection_by_DataFrame(self):
    """Create, re-open and drop a collection with the Milvus client mocked out."""
    from pymilvus import Collection, connections
    from pymilvus import FieldSchema, CollectionSchema
    from pymilvus import DataType

    def stub(method, value=None):
        # Patch one method of the client class so no server is contacted.
        return mock.patch("pymilvus.Milvus." + method, return_value=value)

    coll_name = gen_collection_name()
    collection_schema = CollectionSchema(
        [
            FieldSchema("int64", DataType.INT64),
            FieldSchema("float", DataType.FLOAT),
            FieldSchema("float_vector", DataType.FLOAT_VECTOR, dim=128),
        ],
        primary_field="int64",
    )

    with stub("__init__"):
        connections.connect()

    # First construction: has_collection reports False and a schema is passed.
    with stub("create_collection"), stub("has_collection", False):
        collection = Collection(name=coll_name, schema=collection_schema)

    # Second construction: has_collection reports True, no schema is passed,
    # and describe_collection is stubbed to return the schema dict.
    with stub("create_collection"), \
            stub("has_collection", True), \
            stub("describe_collection", collection_schema.to_dict()):
        collection = Collection(name=coll_name)

    with stub("drop_collection"), stub("describe_index"):
        collection.drop()
def create_collections_and_insert_data():
    """Create one collection per entry of ``all_index_types`` and fill each with 3000 rows.

    The list of collections is printed before and after; for every collection
    the entity count and the time taken to read it are printed as well.
    """
    import random
    import time

    dim = 128
    nb = 3000
    default_schema = CollectionSchema(
        fields=[
            FieldSchema(name="count", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="random_value", dtype=DataType.DOUBLE),
            FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim),
        ],
        description="test collection",
    )

    print("\nList collections...")
    print(list_collections())

    for col_name in all_index_types:
        print("\nCreate collection...")
        collection = Collection(name=col_name, schema=default_schema)

        # insert data: row i carries a constant vector whose components are i / nb
        vectors = [[i / nb] * dim for i in range(nb)]
        ids = list(range(nb))
        random_values = [float(random.randrange(-20, -10)) for _ in range(nb)]
        collection.insert([ids, random_values, vectors])

        print(f"collection name: {col_name}")
        print("Get collection entities")
        started = time.time()
        print(f"collection entities: {collection.num_entities}")
        elapsed = time.time() - started
        print(f"Get collection entities time = {elapsed:.4f}s")

    print("\nList collections...")
    print(list_collections())
def test_collection_by_DataFrame(self):
    """Create, re-open and drop a collection with the gRPC handler mocked out."""
    from pymilvus import Collection
    from pymilvus import FieldSchema, CollectionSchema
    from pymilvus import DataType

    prefix = "pymilvus.client.grpc_handler.GrpcHandler"

    def patched(method, value=None):
        # Replace one handler method with a stub returning ``value``.
        return mock.patch(f"{prefix}.{method}", return_value=value)

    coll_name = gen_collection_name()
    schema_fields = [
        FieldSchema("int64", DataType.INT64),
        FieldSchema("float", DataType.FLOAT),
        FieldSchema("float_vector", DataType.FLOAT_VECTOR, dim=128),
    ]
    collection_schema = CollectionSchema(schema_fields, primary_field="int64")

    with patched("__init__"), patched("_wait_for_channel_ready"):
        connections.connect()

    # First construction: has_collection reports False and a schema is passed.
    with patched("create_collection"), patched("has_collection", False):
        collection = Collection(name=coll_name, schema=collection_schema)

    # Second construction: has_collection reports True, no schema is passed,
    # and describe_collection is stubbed to return the schema dict.
    with patched("create_collection"), \
            patched("has_collection", True), \
            patched("describe_collection", collection_schema.to_dict()):
        collection = Collection(name=coll_name)

    with patched("drop_collection"), patched("describe_index"):
        collection.drop()
def gen_default_fields_with_primary_key_2():
    """Return the default schema, naming the primary key at schema level.

    No field is flagged ``is_primary``; the key is selected with
    ``primary_field="int64"`` on the ``CollectionSchema`` instead.
    """
    int_field = FieldSchema(name="int64", dtype=DataType.INT64)
    double_field = FieldSchema(name="double", dtype=DataType.DOUBLE)
    vector_field = FieldSchema(
        name=default_float_vec_field_name,
        dtype=DataType.FLOAT_VECTOR,
        dim=default_dim,
    )
    return CollectionSchema(
        fields=[int_field, double_field, vector_field],
        description="test collection",
        primary_field="int64",
    )
def gen_binary_schema():
    """Return a schema with an INT64 primary key, a DOUBLE and a binary vector field."""
    pk_field = FieldSchema(name="int64", dtype=DataType.INT64, is_primary=True)
    double_field = FieldSchema(name="double", dtype=DataType.DOUBLE)
    vector_field = FieldSchema(
        name=default_binary_vec_field_name,
        dtype=DataType.BINARY_VECTOR,
        dim=default_dim,
    )
    return CollectionSchema(
        fields=[pk_field, double_field, vector_field],
        description="test collection",
    )
def test_cmp(self, raw_dict_binary_vector):
    """Schemas built from the same dict compare equal; a renamed one does not."""
    import copy

    original = FieldSchema.construct_from_dict(raw_dict_binary_vector)
    twin = FieldSchema.construct_from_dict(raw_dict_binary_vector)
    assert original == twin

    # Work on a deep copy so the fixture dict itself is left untouched.
    renamed_dict = copy.deepcopy(raw_dict_binary_vector)
    renamed_dict["name"] += "_"
    renamed = FieldSchema.construct_from_dict(renamed_dict)
    assert original != renamed
def gen_default_fields(auto_id=True):
    """Return the default "count"/"float"/float-vector collection schema.

    NOTE(review): ``auto_id`` is accepted but never read; the schema is always
    built with ``auto_id=False``. Confirm with callers before wiring it through.
    """
    count_field = FieldSchema(name="count", dtype=DataType.INT64, is_primary=True)
    float_field = FieldSchema(name="float", dtype=DataType.FLOAT)
    vector_field = FieldSchema(
        name=default_float_vec_field_name,
        dtype=DataType.FLOAT_VECTOR,
        dim=default_dim,
    )
    return CollectionSchema(
        fields=[count_field, float_field, vector_field],
        description="test collection",
        segment_row_limit=default_segment_row_limit,
        auto_id=False,
    )
def gen_schema():
    """Return a three-field schema whose field names come from ``gen_field_name()``."""
    from pymilvus import CollectionSchema, FieldSchema

    # Field names are generated in declaration order: primary key, float, vector.
    pk_field = FieldSchema(gen_field_name(), DataType.INT64, is_primary=True, auto_id=False)
    float_field = FieldSchema(gen_field_name(), DataType.FLOAT)
    vector_field = FieldSchema(gen_field_name(), DataType.FLOAT_VECTOR, dim=default_dim)
    return CollectionSchema([pk_field, float_field, vector_field])
def gen_default_fields(description="test collection"):
    """Return the default "int64"/"double"/float-vector schema.

    Args:
        description: Description stored on the resulting ``CollectionSchema``.
    """
    pk_field = FieldSchema(name="int64", dtype=DataType.INT64, is_primary=True)
    double_field = FieldSchema(name="double", dtype=DataType.DOUBLE)
    vector_field = FieldSchema(
        name=default_float_vec_field_name,
        dtype=DataType.FLOAT_VECTOR,
        dim=default_dim,
    )
    return CollectionSchema(
        fields=[pk_field, double_field, vector_field],
        description=description,
    )
def test_to_dict(self, raw_dict_norm, raw_dict_float_vector, raw_dict_binary_vector):
    """``to_dict`` must return an equal but distinct dict for every field kind."""
    sources = [raw_dict_norm, raw_dict_float_vector, raw_dict_binary_vector]
    schemas = [FieldSchema.construct_from_dict(source) for source in sources]

    for schema, source in zip(schemas, sources):
        dumped = schema.to_dict()
        assert dumped == source
        # Equality alone is not enough: the dump must be a new object.
        assert dumped is not source
def create_collection(name, id_field, vector_field):
    """Create a collection with an INT64 primary key and a float vector field.

    Args:
        name: Name of the collection to create.
        id_field: Name of the INT64 primary-key field.
        vector_field: Name of the FLOAT_VECTOR field (dimension ``_DIM``).

    Returns:
        The ``Collection`` object that was constructed.
    """
    pk_schema = FieldSchema(
        name=id_field,
        dtype=DataType.INT64,
        description="int64",
        is_primary=True,
    )
    vec_schema = FieldSchema(
        name=vector_field,
        dtype=DataType.FLOAT_VECTOR,
        description="float vector",
        dim=_DIM,
        is_primary=False,
    )
    schema = CollectionSchema(
        fields=[pk_schema, vec_schema],
        description="collection description",
    )
    created = Collection(name=name, data=None, schema=schema)
    print("\ncollection created:", name)
    return created
def test_constructor_from_float_dict(self, raw_dict_float_vector):
    """A float-vector dict round-trips into the matching ``FieldSchema`` attributes."""
    expected = raw_dict_float_vector
    schema = FieldSchema.construct_from_dict(expected)

    assert schema.dtype == DataType.FLOAT_VECTOR
    assert schema.description == expected['description']
    assert schema.is_primary == False
    assert schema.name == expected['name']
    assert schema.dim == expected['params']['dim']
def test_constructor_from_norm_dict(self, raw_dict_norm):
    """A scalar-field dict yields an INT64 schema with no ``dim``."""
    expected = raw_dict_norm
    schema = FieldSchema.construct_from_dict(expected)

    assert schema.dtype == DataType.INT64
    assert schema.description == expected['description']
    assert schema.is_primary == False
    assert schema.name == expected['name']
    assert schema.dim is None
    # NOTE(review): presumably unknown attributes resolve to None — confirm.
    assert schema.dummy is None
def create_collections_and_insert_data(prefix, flush=True, count=3000, collection_cnt=11):
    """Create prefixed collections and insert ``count`` rows into each in batches.

    One collection named ``prefix + index_name`` is created for each of the
    first ``collection_cnt`` entries of ``all_index_types``. Rows are inserted
    in ``count // nb`` batches of ``nb`` rows, so any remainder beyond the last
    full batch is not inserted.

    Args:
        prefix: String prepended to every index type to form the collection name.
        flush: When true, read and print ``collection.num_entities`` after the
            inserts, together with the time that read took.
        count: Total number of rows to generate per collection.
        collection_cnt: Number of entries of ``all_index_types`` to use.
    """
    import random
    # Imported locally, like ``random``, so the timing calls below do not
    # depend on a module-level import being present.
    import time

    dim = 128
    # A batch is a tenth of the total. Clamp to 1 so that count < 10 does not
    # produce nb == 0 and a ZeroDivisionError when computing the batch count.
    nb = max(1, count // 10)
    default_fields = [
        FieldSchema(name="count", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="random_value", dtype=DataType.DOUBLE),
        FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim),
    ]
    default_schema = CollectionSchema(fields=default_fields, description="test collection")

    for index_name in all_index_types[:collection_cnt]:
        print("\nCreate collection...")
        col_name = prefix + index_name
        collection = Collection(name=col_name, schema=default_schema)
        print(f"collection name: {col_name}")
        print(f"begin insert, count: {count} nb: {nb}")

        times = count // nb
        total_time = 0.0
        vectors = [[random.random() for _ in range(dim)] for _ in range(count)]
        for j in range(times):
            lo, hi = nb * j, nb * j + nb
            start_time = time.time()
            collection.insert([
                list(range(lo, hi)),
                [float(random.randrange(-20, -10)) for _ in range(nb)],
                vectors[lo:hi],
            ])
            end_time = time.time()
            print(f"[{j+1}/{times}] insert {nb} data, time: {end_time - start_time:.4f}")
            total_time += end_time - start_time

        print(f"end insert, time: {total_time:.4f}")
        if flush:
            print("Get collection entities")
            start_time = time.time()
            print(f"collection entities: {collection.num_entities}")
            end_time = time.time()
            print("Get collection entities time = %.4fs" % (end_time - start_time))

    print("\nList collections...")
    print(get_collections(prefix))
def create_collection(name, id_field, vector_field, str_field):
    """Create a collection with a primary key, a float vector and a VARCHAR field.

    Args:
        name: Name of the collection to create.
        id_field: Name of the INT64 primary-key field.
        vector_field: Name of the FLOAT_VECTOR field (dimension ``_DIM``).
        str_field: Name of the VARCHAR field (``max_len_per_row`` is
            ``_MAX_LEN_PER_ROW``).

    Returns:
        The ``Collection`` object that was constructed.
    """
    pk_schema = FieldSchema(
        name=id_field,
        dtype=DataType.INT64,
        description="int64",
        is_primary=True,
    )
    vec_schema = FieldSchema(
        name=vector_field,
        dtype=DataType.FLOAT_VECTOR,
        description="float vector",
        dim=_DIM,
        is_primary=False,
    )
    str_schema = FieldSchema(
        name=str_field,
        dtype=DataType.VARCHAR,
        description="string",
        max_len_per_row=_MAX_LEN_PER_ROW,
        is_primary=False,
    )
    schema = CollectionSchema(
        fields=[pk_schema, vec_schema, str_schema],
        description="collection description",
    )
    created = Collection(name=name, data=None, schema=schema)
    print("\ncollection created:", name)
    return created
def hello_milvus(host="127.0.0.1"):
    """Run an end-to-end demo against the Milvus instance at ``host``, port 19530.

    Steps: connect, create the "hello_milvus" collection, insert 3000 rows,
    build an IVF_SQ8 index, load, search, then query. Each step prints its
    elapsed time.

    Args:
        host: Host name or address passed to ``connections.connect``.
    """
    import time

    # create connection
    connections.connect(host=host, port="19530")

    print("\nList collections...")
    print(list_collections())

    # create collection
    dim = 128
    default_schema = CollectionSchema(
        fields=[
            FieldSchema(name="int64", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="float", dtype=DataType.FLOAT),
            FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length=65535),
            FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim),
        ],
        description="test collection",
    )

    print("\nCreate collection...")
    collection = Collection(name="hello_milvus", schema=default_schema)

    print("\nList collections...")
    print(list_collections())

    # insert data
    nb = 3000
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    started = time.time()
    collection.insert([
        list(range(nb)),
        [np.float32(i) for i in range(nb)],
        [str(i) for i in range(nb)],
        vectors,
    ])
    elapsed = time.time() - started
    print(f"\nInsert {nb} vectors cost {elapsed:.4f} seconds")

    started = time.time()
    print("\nGet collection entities...")
    print(collection.num_entities)
    elapsed = time.time() - started
    print(f"\nGet collection entities cost {elapsed:.4f} seconds")

    # create index and load table
    default_index = {
        "index_type": "IVF_SQ8",
        "metric_type": "L2",
        "params": {"nlist": 64},
    }
    print("\nCreate index...")
    started = time.time()
    collection.create_index(field_name="float_vector", index_params=default_index)
    elapsed = time.time() - started
    print(f"\nCreate index cost {elapsed:.4f} seconds")

    print("\nGet replicas number")
    try:
        replicas_info = collection.get_replicas()
        replica_number = len(replicas_info.groups)
        print(f"\nReplicas number is {replica_number}")
    except Exception as e:
        # Best effort: report the failure and fall back to a single replica.
        print(str(e))
        replica_number = 1

    print("\nload collection...")
    started = time.time()
    collection.load(replica_number=replica_number)
    elapsed = time.time() - started
    print(f"\nload collection cost {elapsed:.4f} seconds")

    # load and search
    topK = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    started = time.time()
    print("\nSearch...")
    # define output_fields of search result
    res = collection.search(
        vectors[-2:],
        "float_vector",
        search_params,
        topK,
        "int64 > 100",
        output_fields=["int64", "float"],
        timeout=TIMEOUT,
    )
    elapsed = time.time() - started
    print(f"search cost {elapsed:.4f} seconds")

    # show result
    for hits in res:
        for hit in hits:
            # Get value of the "float" field for search result
            print(hit, hit.entity.get("float"))

    # query
    expr = "int64 in [2,4,6,8]"
    output_fields = ["int64", "float"]
    res = collection.query(expr, output_fields, timeout=TIMEOUT)
    for row in sorted(res, key=lambda item: item['int64']):
        print(row)
#################################################################################
# 2. create collection
# The collection is created with 3 fields.
# +-+------------+------------+------------------+------------------------------+
# | | field name | field type | other attributes |       field description      |
# +-+------------+------------+------------------+------------------------------+
# |1|    "pk"    |    Int64   |  is_primary=True |       "primary field"        |
# | |            |            |   auto_id=False  |                              |
# +-+------------+------------+------------------+------------------------------+
# |2|  "random"  |   Double   |                  |       "a double field"       |
# +-+------------+------------+------------------+------------------------------+
# |3|"embeddings"| FloatVector|      dim=8       |  "float vector with dim 8"   |
# +-+------------+------------+------------------+------------------------------+
fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=False),
    FieldSchema(name="random", dtype=DataType.DOUBLE),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim),
]

schema = CollectionSchema(
    fields,
    description="hello_milvus is the simplest demo to introduce the APIs",
)

print(fmt.format("Create collection `hello_milvus`"))
hello_milvus = Collection(
    "hello_milvus",
    schema=schema,
    consistency_level="Strong",
)

################################################################################
# 3. insert data
# We are going to insert 3000 rows of data into `hello_milvus`
# Data to be inserted must be organized in fields.
def hello_milvus(host="127.0.0.1"):
    """Run an end-to-end demo against the Milvus instance at ``host``, port 19530.

    Steps: connect, create the "hello_milvus" collection, insert 3000 rows,
    build an IVF_FLAT index, load, search, then query. Each step prints its
    elapsed time.

    Args:
        host: Host name or address passed to ``connections.connect``.
    """
    import time

    # create connection
    connections.connect(host=host, port="19530")

    print("\nList collections...")
    print(list_collections())

    # create collection
    dim = 128
    default_schema = CollectionSchema(
        fields=[
            FieldSchema(name="count", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="random_value", dtype=DataType.DOUBLE),
            FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim),
        ],
        description="test collection",
    )

    print("\nCreate collection...")
    collection = Collection(name="hello_milvus", schema=default_schema)

    print("\nList collections...")
    print(list_collections())

    # insert data
    nb = 3000
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    started = time.time()
    collection.insert([
        list(range(nb)),
        [float(random.randrange(-20, -10)) for _ in range(nb)],
        vectors,
    ])
    elapsed = time.time() - started
    print(f"\nInsert {nb} vectors cost {elapsed} seconds")

    started = time.time()
    print("\nGet collection entities...")
    print(collection.num_entities)
    elapsed = time.time() - started
    print(f"\nGet collection entities cost {elapsed} seconds")

    # create index and load table
    default_index = {
        "index_type": "IVF_FLAT",
        "params": {"nlist": 128},
        "metric_type": "L2",
    }
    print("\nCreate index...")
    started = time.time()
    collection.create_index(field_name="float_vector", index_params=default_index)
    elapsed = time.time() - started
    print(f"\nCreate index cost {elapsed} seconds")

    print("\nload collection...")
    started = time.time()
    collection.load()
    elapsed = time.time() - started
    print(f"\nload collection cost {elapsed} seconds")

    # load and search
    topK = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    search_started = time.time()
    print("\nSearch...")
    # define output_fields of search result
    res = collection.search(
        vectors[-2:],
        "float_vector",
        search_params,
        topK,
        "count > 100",
        output_fields=["count", "random_value"],
    )
    search_finished = time.time()

    # show result
    for hits in res:
        for hit in hits:
            # Get value of the random value field for search result
            print(hit, hit.entity.get("random_value"))
    print(f"search latency = {search_finished - search_started:.4f}s")

    # query
    expr = "count in [2,4,6,8]"
    output_fields = ["count", "random_value"]
    res = collection.query(expr, output_fields)
    for row in sorted(res, key=lambda item: item['count']):
        print(row)
def hello_milvus(collection_name):
    """Run the insert/index/load/search/query demo on ``collection_name``.

    When the collection already exists, its stored schema and vector dimension
    are reused instead of the defaults built here. The collection is released
    at the end.

    Args:
        collection_name: Name of the collection to create or reuse.
    """
    import time

    # create collection
    dim = 128
    default_schema = CollectionSchema(
        fields=[
            FieldSchema(name="int64", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="float", dtype=DataType.FLOAT),
            FieldSchema(name="varchar", dtype=DataType.VARCHAR, max_length_per_row=65535),
            FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim),
        ],
        description="test collection",
    )
    if utility.has_collection(collection_name):
        print("collection is exist")
        collection = Collection(name=collection_name)
        default_schema = collection.schema
        # NOTE(review): 101 / 102 are raw dtype codes, presumably the vector
        # DataType values — confirm. The first matching field supplies the dim.
        vector_dims = [
            field.params['dim']
            for field in default_schema.fields
            if field.dtype in [101, 102]
        ]
        dim = vector_dims[0]

    print("\nCreate collection...")
    collection = Collection(name=collection_name, schema=default_schema)

    # insert data
    nb = 3000
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    started = time.time()
    collection.insert([
        list(range(nb)),
        [np.float32(i) for i in range(nb)],
        [str(i) for i in range(nb)],
        vectors,
    ])
    elapsed = time.time() - started
    print(f"\nInsert {nb} vectors cost {elapsed:.4f} seconds")

    started = time.time()
    print("\nGet collection entities...")
    print(collection.num_entities)
    elapsed = time.time() - started
    print(f"\nGet collection entities cost {elapsed:.4f} seconds")

    # create index and load table
    default_index = {
        "index_type": "IVF_SQ8",
        "metric_type": "L2",
        "params": {"nlist": 64},
    }
    print("\nCreate index...")
    started = time.time()
    collection.create_index(field_name="float_vector", index_params=default_index)
    elapsed = time.time() - started
    print(f"\nCreate index cost {elapsed:.4f} seconds")

    print("\nload collection...")
    started = time.time()
    collection.load()
    elapsed = time.time() - started
    print(f"\nload collection cost {elapsed:.4f} seconds")

    # load and search
    topK = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    started = time.time()
    print("\nSearch...")
    # define output_fields of search result
    res = collection.search(
        vectors[-2:],
        "float_vector",
        search_params,
        topK,
        "int64 > 100",
        output_fields=["int64", "float"],
        timeout=TIMEOUT,
    )
    elapsed = time.time() - started
    print(f"search cost {elapsed:.4f} seconds")

    # show result
    for hits in res:
        for hit in hits:
            # Get value of the "float" field for search result
            print(hit, hit.entity.get("float"))

    # query
    expr = "int64 in [2,4,6,8]"
    output_fields = ["int64", "float"]
    res = collection.query(expr, output_fields, timeout=TIMEOUT)
    for row in sorted(res, key=lambda item: item['int64']):
        print(row)

    collection.release()
from pymilvus import (
    connections,
    list_collections,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
    Partition,
    utility,
)

# Connect with the default connection settings.
print("\nCreate connection...")
connections.connect()

# Show the collections that exist before anything is created.
print("\nList collections...")
print(list_collections())

# Create a collection named 'demo_film_tutorial' with an INT64 primary key
# and an 8-dimensional float vector.
print("\nCreate collection...")
field1 = FieldSchema(
    name="release_year",
    dtype=DataType.INT64,
    description="int64",
    is_primary=True,
)
field2 = FieldSchema(
    name="embedding",
    dtype=DataType.FLOAT_VECTOR,
    description="float vector",
    dim=8,
    is_primary=False,
)
schema = CollectionSchema(
    fields=[field1, field2],
    description="collection description",
)
collection = Collection(name='demo_film_tutorial', data=None, schema=schema)

# Show the collections again now that the new one has been created.
print("\nList collections...")
print(list_collections())

print("\nGet collection name, schema and description...")
def hello_milvus():
    """Run a minimal demo: create, fill, index, load, search and drop "hello_milvus"."""
    # create connection
    connections.connect()

    print("\nList collections...")
    print(list_collections())

    # create collection
    dim = 128
    default_schema = CollectionSchema(
        fields=[
            FieldSchema(name="count", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="random_value", dtype=DataType.DOUBLE),
            FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim),
        ],
        description="test collection",
    )

    print("\nCreate collection...")
    collection = Collection(name="hello_milvus", schema=default_schema)

    print("\nList collections...")
    print(list_collections())

    # insert data
    nb = 3000
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    collection.insert([
        list(range(nb)),
        [float(random.randrange(-20, -10)) for _ in range(nb)],
        vectors,
    ])

    print("\nGet collection entities...")
    print(collection.num_entities)

    # create index and load table
    default_index = {
        "index_type": "IVF_FLAT",
        "params": {"nlist": 128},
        "metric_type": "L2",
    }
    print("\nCreate index...")
    collection.create_index(field_name="float_vector", index_params=default_index)
    print("\nload collection...")
    collection.load()

    # load and search
    topK = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    import time
    search_started = time.time()
    print("\nSearch...")
    # define output_fields of search result
    res = collection.search(
        vectors[-2:],
        "float_vector",
        search_params,
        topK,
        "count > 100",
        output_fields=["count", "random_value"],
    )
    search_finished = time.time()

    # show result
    for hits in res:
        for hit in hits:
            # Get value of the random value field for search result
            print(hit, hit.entity.get("random_value"))
    print(f"search latency = {search_finished - search_started:.4f}s")

    # drop collection
    collection.drop()