def create_collection(name, id_field, vector_field, attr1_name, attr2_name):
    """Create a four-field collection called ``name`` and return it.

    The schema consists of an INT64 primary key (``id_field``), a
    FLOAT_VECTOR field (``vector_field``) of dimension ``_DIM``, an INT64
    attribute (``attr1_name``) and a DOUBLE attribute (``attr2_name``).
    """
    schema_fields = [
        FieldSchema(name=id_field,
                    dtype=DataType.INT64,
                    description="int64",
                    is_primary=True),
        FieldSchema(name=vector_field,
                    dtype=DataType.FLOAT_VECTOR,
                    description="float vector",
                    dim=_DIM,
                    is_primary=False),
        # TODO: remove dim from the two scalar attribute fields below.
        FieldSchema(name=attr1_name,
                    dtype=DataType.INT64,
                    description="attr1",
                    is_primary=False,
                    dim=_DIM),
        FieldSchema(name=attr2_name,
                    dtype=DataType.DOUBLE,
                    description="attr2",
                    is_primary=False,
                    dim=_DIM),
    ]
    collection_schema = CollectionSchema(
        fields=schema_fields, description="collection description")
    created = Collection(name=name, data=None, schema=collection_schema)
    print("\ncollection created:", name)
    return created
Пример #2
0
    def test_collection_by_DataFrame(self):
        """Create, re-open and drop a collection with the Milvus client patched.

        Every ``pymilvus.Milvus`` method touched along the way is replaced by
        a mock with a canned return value, so the test passes if none of the
        ``Collection`` calls raises.
        """
        from pymilvus import Collection, connections
        from pymilvus import FieldSchema, CollectionSchema
        from pymilvus import DataType

        coll_name = gen_collection_name()
        schema = CollectionSchema(
            [
                FieldSchema("int64", DataType.INT64),
                FieldSchema("float", DataType.FLOAT),
                FieldSchema("float_vector", DataType.FLOAT_VECTOR, dim=128),
            ],
            primary_field="int64",
        )

        with mock.patch("pymilvus.Milvus.__init__", return_value=None):
            connections.connect()

        # has_collection reports False while a schema is supplied.
        with mock.patch("pymilvus.Milvus.create_collection",
                        return_value=None), \
                mock.patch("pymilvus.Milvus.has_collection",
                           return_value=False):
            collection = Collection(name=coll_name, schema=schema)

        # has_collection reports True and describe_collection hands back the
        # same schema as a dict; no schema is passed to Collection this time.
        with mock.patch("pymilvus.Milvus.create_collection",
                        return_value=None), \
                mock.patch("pymilvus.Milvus.has_collection",
                           return_value=True), \
                mock.patch("pymilvus.Milvus.describe_collection",
                           return_value=schema.to_dict()):
            collection = Collection(name=coll_name)

        with mock.patch("pymilvus.Milvus.drop_collection",
                        return_value=None), \
                mock.patch("pymilvus.Milvus.describe_index",
                           return_value=None):
            collection.drop()
Пример #3
0
def create_collections_and_insert_data():
    """Create one collection per entry of ``all_index_types`` and fill it.

    Each collection shares the same schema (INT64 primary key ``count``,
    DOUBLE ``random_value``, 128-dimensional ``float_vector``) and receives
    3000 rows. The time needed to read ``num_entities`` is printed for every
    collection, and the collection list is printed before and after.
    """
    import random
    import time

    dim = 128
    schema = CollectionSchema(
        fields=[
            FieldSchema(name="count", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="random_value", dtype=DataType.DOUBLE),
            FieldSchema(name="float_vector",
                        dtype=DataType.FLOAT_VECTOR,
                        dim=dim),
        ],
        description="test collection",
    )

    print("\nList collections...")
    print(list_collections())

    for col_name in all_index_types:
        print("\nCreate collection...")
        collection = Collection(name=col_name, schema=schema)

        # Row i carries the constant vector [i / nb] * dim.
        nb = 3000
        vectors = [[i / nb] * dim for i in range(nb)]
        ids = list(range(nb))
        random_values = [
            float(random.randrange(-20, -10)) for _ in range(nb)
        ]
        collection.insert([ids, random_values, vectors])

        print(f"collection name: {col_name}")
        print("Get collection entities")
        started = time.time()
        print(f"collection entities: {collection.num_entities}")
        elapsed = time.time() - started
        print(f"Get collection entities time = {elapsed:.4f}s")

    print("\nList collections...")
    print(list_collections())
Пример #4
0
    def test_collection_by_DataFrame(self):
        """Create, re-open and drop a collection with ``GrpcHandler`` patched.

        All handler methods used along the way are replaced by mocks with
        canned return values, so the test passes if none of the
        ``Collection`` calls raises.
        """
        from pymilvus import Collection
        from pymilvus import FieldSchema, CollectionSchema
        from pymilvus import DataType

        coll_name = gen_collection_name()
        schema_fields = [
            FieldSchema("int64", DataType.INT64),
            FieldSchema("float", DataType.FLOAT),
            FieldSchema("float_vector", DataType.FLOAT_VECTOR, dim=128),
        ]

        # Dotted path of the class whose methods get patched below.
        prefix = "pymilvus.client.grpc_handler.GrpcHandler"

        schema = CollectionSchema(schema_fields, primary_field="int64")

        with mock.patch(f"{prefix}.__init__", return_value=None), \
                mock.patch(f"{prefix}._wait_for_channel_ready",
                           return_value=None):
            connections.connect()

        # has_collection reports False while a schema is supplied.
        with mock.patch(f"{prefix}.create_collection", return_value=None), \
                mock.patch(f"{prefix}.has_collection", return_value=False):
            collection = Collection(name=coll_name, schema=schema)

        # has_collection reports True and describe_collection hands back the
        # same schema as a dict; no schema is passed to Collection this time.
        with mock.patch(f"{prefix}.create_collection", return_value=None), \
                mock.patch(f"{prefix}.has_collection", return_value=True), \
                mock.patch(f"{prefix}.describe_collection",
                           return_value=schema.to_dict()):
            collection = Collection(name=coll_name)

        with mock.patch(f"{prefix}.drop_collection", return_value=None), \
                mock.patch(f"{prefix}.describe_index", return_value=None):
            collection.drop()
Пример #5
0
def gen_default_fields_with_primary_key_2():
    """Return the default test schema with the primary key set on the schema.

    No field is flagged ``is_primary``; the ``int64`` field is designated
    through the ``primary_field`` argument of ``CollectionSchema`` instead.
    """
    int_field = FieldSchema(name="int64", dtype=DataType.INT64)
    double_field = FieldSchema(name="double", dtype=DataType.DOUBLE)
    vector_field = FieldSchema(name=default_float_vec_field_name,
                               dtype=DataType.FLOAT_VECTOR,
                               dim=default_dim)
    return CollectionSchema(fields=[int_field, double_field, vector_field],
                            description="test collection",
                            primary_field="int64")
Пример #6
0
def gen_binary_schema():
    """Return a test schema whose vector field is a BINARY_VECTOR.

    Fields: INT64 primary key ``int64``, DOUBLE ``double`` and a binary
    vector named ``default_binary_vec_field_name`` created with
    ``dim=default_dim``.
    """
    primary_key = FieldSchema(name="int64",
                              dtype=DataType.INT64,
                              is_primary=True)
    double_field = FieldSchema(name="double", dtype=DataType.DOUBLE)
    binary_vector = FieldSchema(name=default_binary_vec_field_name,
                                dtype=DataType.BINARY_VECTOR,
                                dim=default_dim)
    return CollectionSchema(
        fields=[primary_key, double_field, binary_vector],
        description="test collection")
Пример #7
0
 def test_cmp(self, raw_dict_binary_vector):
     """Fields built from equal dicts compare equal; a renamed one does not."""
     import copy

     baseline = FieldSchema.construct_from_dict(raw_dict_binary_vector)
     twin = FieldSchema.construct_from_dict(raw_dict_binary_vector)
     assert baseline == twin

     # Work on a deep copy so the fixture dict itself is left untouched.
     renamed_dict = copy.deepcopy(raw_dict_binary_vector)
     renamed_dict["name"] += "_"
     renamed = FieldSchema.construct_from_dict(renamed_dict)
     assert baseline != renamed
Пример #8
0
def gen_default_fields(auto_id=True):
    """Build the default three-field test schema.

    Fields: INT64 primary key ``count``, FLOAT ``float`` and a FLOAT_VECTOR
    named ``default_float_vec_field_name`` with ``default_dim`` dimensions.
    The schema is created with ``segment_row_limit=default_segment_row_limit``.

    NOTE(review): ``auto_id`` is accepted but never read; the schema is
    always built with ``auto_id=False``. Confirm what callers expect before
    wiring the argument through.
    """
    schema_fields = [
        FieldSchema(name="count", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="float", dtype=DataType.FLOAT),
        FieldSchema(name=default_float_vec_field_name,
                    dtype=DataType.FLOAT_VECTOR,
                    dim=default_dim),
    ]
    return CollectionSchema(fields=schema_fields,
                            description="test collection",
                            segment_row_limit=default_segment_row_limit,
                            auto_id=False)
Пример #9
0
def gen_schema():
    """Return a three-field schema whose field names come from ``gen_field_name``.

    Fields, in order: INT64 primary key (``auto_id=False``), FLOAT, and a
    FLOAT_VECTOR with ``default_dim`` dimensions.
    """
    from pymilvus import CollectionSchema, FieldSchema

    primary_key = FieldSchema(gen_field_name(),
                              DataType.INT64,
                              is_primary=True,
                              auto_id=False)
    float_field = FieldSchema(gen_field_name(), DataType.FLOAT)
    vector_field = FieldSchema(gen_field_name(),
                               DataType.FLOAT_VECTOR,
                               dim=default_dim)
    return CollectionSchema([primary_key, float_field, vector_field])
Пример #10
0
def gen_default_fields(description="test collection"):
    """Build the default three-field test schema with the given description.

    Fields: INT64 primary key ``int64``, DOUBLE ``double`` and a FLOAT_VECTOR
    named ``default_float_vec_field_name`` with ``default_dim`` dimensions.
    """
    primary_key = FieldSchema(name="int64",
                              dtype=DataType.INT64,
                              is_primary=True)
    double_field = FieldSchema(name="double", dtype=DataType.DOUBLE)
    vector_field = FieldSchema(name=default_float_vec_field_name,
                               dtype=DataType.FLOAT_VECTOR,
                               dim=default_dim)
    return CollectionSchema(
        fields=[primary_key, double_field, vector_field],
        description=description)
Пример #11
0
    def test_to_dict(self, raw_dict_norm, raw_dict_float_vector,
                     raw_dict_binary_vector):
        """``to_dict`` returns a dict equal to, but distinct from, its source."""
        dicts = [raw_dict_norm, raw_dict_float_vector, raw_dict_binary_vector]
        fields = [FieldSchema.construct_from_dict(raw) for raw in dicts]

        for expected, field in zip(dicts, fields):
            target = field.to_dict()
            assert target == expected
            # A fresh object must be returned, not the input dict itself.
            assert target is not expected
Пример #12
0
def create_collection(name, id_field, vector_field):
    """Create a two-field collection called ``name`` and return it.

    The schema consists of an INT64 primary key (``id_field``) and a
    FLOAT_VECTOR field (``vector_field``) of dimension ``_DIM``.
    """
    schema_fields = [
        FieldSchema(name=id_field,
                    dtype=DataType.INT64,
                    description="int64",
                    is_primary=True),
        FieldSchema(name=vector_field,
                    dtype=DataType.FLOAT_VECTOR,
                    description="float vector",
                    dim=_DIM,
                    is_primary=False),
    ]
    collection_schema = CollectionSchema(
        fields=schema_fields, description="collection description")
    created = Collection(name=name, data=None, schema=collection_schema)
    print("\ncollection created:", name)
    return created
Пример #13
0
 def test_constructor_from_float_dict(self, raw_dict_float_vector):
     """A float-vector dict yields a field carrying dtype, name, description and dim."""
     source = raw_dict_float_vector
     built = FieldSchema.construct_from_dict(source)

     assert built.dtype == DataType.FLOAT_VECTOR
     assert built.description == source['description']
     # Equality (not identity) comparison is kept on purpose.
     assert built.is_primary == False
     assert built.name == source['name']
     assert built.dim == source['params']['dim']
Пример #14
0
 def test_constructor_from_norm_dict(self, raw_dict_norm):
     """A scalar-field dict yields an INT64 field with no ``dim``."""
     source = raw_dict_norm
     built = FieldSchema.construct_from_dict(source)

     assert built.dtype == DataType.INT64
     assert built.description == source['description']
     # Equality (not identity) comparison is kept on purpose.
     assert built.is_primary == False
     assert built.name == source['name']
     assert built.dim is None
     # An attribute that was never set is expected to read as None too.
     assert built.dummy is None
Пример #15
0
def create_collections_and_insert_data(prefix,
                                       flush=True,
                                       count=3000,
                                       collection_cnt=11):
    """Create prefixed collections and insert random rows into each in batches.

    One collection named ``prefix + index_name`` is created for each of the
    first ``collection_cnt`` entries of ``all_index_types``. All collections
    share one schema: INT64 primary key ``count``, DOUBLE ``random_value``
    and a 128-dimensional ``float_vector``. Per-batch and total insert times
    are printed, and the matching collections are listed at the end.

    Args:
        prefix: String prepended to every index type name to form the
            collection name; also passed to ``get_collections``.
        flush: When true, read and print ``collection.num_entities`` (timed)
            after the inserts.
        count: Number of random vectors generated per collection. Rows are
            inserted in batches of ``count // 10`` (at least 1), and only
            ``count // batch_size`` full batches are inserted, so a remainder
            smaller than one batch is generated but not inserted.
        collection_cnt: How many entries of ``all_index_types`` to use.
    """
    import random
    import time  # imported locally, like ``random``, so the block is self-contained

    dim = 128
    # Clamp the batch size to 1: with count < 10 the plain ``count // 10`` is
    # 0, which made the ``count // nb`` below raise ZeroDivisionError.
    nb = max(1, count // 10)
    default_fields = [
        FieldSchema(name="count", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="random_value", dtype=DataType.DOUBLE),
        FieldSchema(name="float_vector", dtype=DataType.FLOAT_VECTOR, dim=dim)
    ]
    default_schema = CollectionSchema(fields=default_fields,
                                      description="test collection")
    for index_name in all_index_types[:collection_cnt]:
        print("\nCreate collection...")
        col_name = prefix + index_name
        collection = Collection(name=col_name, schema=default_schema)
        print(f"collection name: {col_name}")
        print(f"begin insert, count: {count} nb: {nb}")
        times = count // nb
        total_time = 0.0
        vectors = [[random.random() for _ in range(dim)] for _ in range(count)]
        for j in range(times):
            batch_start = nb * j
            batch_end = batch_start + nb
            start_time = time.time()
            collection.insert(
                [list(range(batch_start, batch_end)),
                 [float(random.randrange(-20, -10)) for _ in range(nb)],
                 vectors[batch_start:batch_end]])
            end_time = time.time()
            print(
                f"[{j+1}/{times}] insert {nb} data, time: {end_time - start_time:.4f}"
            )
            total_time += end_time - start_time

        print(f"end insert, time: {total_time:.4f}")
        if flush:
            print("Get collection entities")
            start_time = time.time()
            print(f"collection entities: {collection.num_entities}")
            end_time = time.time()
            print(f"Get collection entities time = {end_time - start_time:.4f}s")
    print("\nList collections...")
    print(get_collections(prefix))
Пример #16
0
def create_collection(name, id_field, vector_field, str_field):
    """Create a three-field collection called ``name`` and return it.

    The schema consists of an INT64 primary key (``id_field``), a
    FLOAT_VECTOR field (``vector_field``) of dimension ``_DIM`` and a VARCHAR
    field (``str_field``) limited by ``_MAX_LEN_PER_ROW``.
    """
    schema_fields = [
        FieldSchema(name=id_field,
                    dtype=DataType.INT64,
                    description="int64",
                    is_primary=True),
        FieldSchema(name=vector_field,
                    dtype=DataType.FLOAT_VECTOR,
                    description="float vector",
                    dim=_DIM,
                    is_primary=False),
        # NOTE(review): released pymilvus versions appear to name this VARCHAR
        # limit ``max_length`` - confirm ``max_len_per_row`` against the
        # installed client.
        FieldSchema(name=str_field,
                    dtype=DataType.VARCHAR,
                    description="string",
                    max_len_per_row=_MAX_LEN_PER_ROW,
                    is_primary=False),
    ]
    collection_schema = CollectionSchema(
        fields=schema_fields, description="collection description")
    created = Collection(name=name, data=None, schema=collection_schema)
    print("\ncollection created:", name)
    return created
Пример #17
0
def hello_milvus(host="127.0.0.1"):
    """Run an end-to-end scenario against the Milvus server at ``host``.

    Connects on port 19530, creates the ``hello_milvus`` collection (INT64
    primary key, FLOAT, VARCHAR and a 128-dimensional float vector), inserts
    3000 rows, builds an IVF_SQ8 index, loads the collection with the replica
    number reported by ``get_replicas`` (1 if that call raises), then runs a
    filtered search and a query. Timings are printed for each step.
    """
    import time

    connections.connect(host=host, port="19530")

    print("\nList collections...")
    print(list_collections())

    # Schema and collection.
    dim = 128
    schema = CollectionSchema(
        fields=[
            FieldSchema(name="int64", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="float", dtype=DataType.FLOAT),
            FieldSchema(name="varchar",
                        dtype=DataType.VARCHAR,
                        max_length=65535),
            FieldSchema(name="float_vector",
                        dtype=DataType.FLOAT_VECTOR,
                        dim=dim),
        ],
        description="test collection",
    )

    print("\nCreate collection...")
    collection = Collection(name="hello_milvus", schema=schema)

    print("\nList collections...")
    print(list_collections())

    # Insert: one column per schema field, in schema order.
    nb = 3000
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    started = time.time()
    collection.insert([
        list(range(nb)),
        [np.float32(i) for i in range(nb)],
        [str(i) for i in range(nb)],
        vectors,
    ])
    elapsed = time.time() - started
    print(f"\nInsert {nb} vectors cost {elapsed:.4f} seconds")

    started = time.time()
    print("\nGet collection entities...")
    print(collection.num_entities)
    elapsed = time.time() - started
    print(f"\nGet collection entities cost {elapsed:.4f} seconds")

    # Index creation.
    index_params = {
        "index_type": "IVF_SQ8",
        "metric_type": "L2",
        "params": {"nlist": 64},
    }
    print("\nCreate index...")
    started = time.time()
    collection.create_index(field_name="float_vector",
                            index_params=index_params)
    elapsed = time.time() - started
    print(f"\nCreate index cost {elapsed:.4f} seconds")

    # Best effort: any failure to read the replica layout falls back to 1.
    print("\nGet replicas number")
    try:
        replicas_info = collection.get_replicas()
        replica_number = len(replicas_info.groups)
        print(f"\nReplicas number is {replica_number}")
    except Exception as exc:
        print(str(exc))
        replica_number = 1

    print("\nload collection...")
    started = time.time()
    collection.load(replica_number=replica_number)
    elapsed = time.time() - started
    print(f"\nload collection cost {elapsed:.4f} seconds")

    # Search with the last two inserted vectors as queries.
    top_k = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    started = time.time()
    print("\nSearch...")
    results = collection.search(vectors[-2:],
                                "float_vector",
                                search_params,
                                top_k,
                                "int64 > 100",
                                output_fields=["int64", "float"],
                                timeout=TIMEOUT)
    elapsed = time.time() - started
    print(f"search cost  {elapsed:.4f} seconds")
    for hits in results:
        for hit in hits:
            # Print each hit together with its "float" field value.
            print(hit, hit.entity.get("float"))

    # Query by primary key and print the rows ordered by "int64".
    expr = "int64 in [2,4,6,8]"
    output_fields = ["int64", "float"]
    rows = collection.query(expr, output_fields, timeout=TIMEOUT)
    for row in sorted(rows, key=lambda item: item['int64']):
        print(row)
Пример #18
0
#################################################################################
# 2. create collection
# We're going to create a collection with 3 fields.
# +-+------------+------------+------------------+------------------------------+
# | | field name | field type | other attributes |       field description      |
# +-+------------+------------+------------------+------------------------------+
# |1|    "pk"    |    Int64   |  is_primary=True |      "primary field"         |
# | |            |            |   auto_id=False  |                              |
# +-+------------+------------+------------------+------------------------------+
# |2|  "random"  |    Double  |                  |      "a double field"        |
# +-+------------+------------+------------------+------------------------------+
# |3|"embeddings"| FloatVector|     dim=8        |  "float vector with dim 8"   |
# +-+------------+------------+------------------+------------------------------+
# Field order here is the column order expected when inserting data.
fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True,
                auto_id=False),
    FieldSchema(name="random", dtype=DataType.DOUBLE),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=dim),
]

schema = CollectionSchema(
    fields=fields,
    description="hello_milvus is the simplest demo to introduce the APIs",
)

print(fmt.format("Create collection `hello_milvus`"))
hello_milvus = Collection(name="hello_milvus",
                          schema=schema,
                          consistency_level="Strong")

################################################################################
# 3. insert data
# We are going to insert 3000 rows of data into `hello_milvus`
# Data to be inserted must be organized in fields.
Пример #19
0
def hello_milvus(host="127.0.0.1"):
    """Run an end-to-end scenario against the Milvus server at ``host``.

    Connects on port 19530, creates the ``hello_milvus`` collection (INT64
    primary key ``count``, DOUBLE ``random_value`` and a 128-dimensional
    float vector), inserts 3000 rows, builds an IVF_FLAT index, loads the
    collection, then runs a filtered search and a query. Timings are printed
    for each step.
    """
    import time

    connections.connect(host=host, port="19530")

    print("\nList collections...")
    print(list_collections())

    # Schema and collection.
    dim = 128
    schema = CollectionSchema(
        fields=[
            FieldSchema(name="count", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="random_value", dtype=DataType.DOUBLE),
            FieldSchema(name="float_vector",
                        dtype=DataType.FLOAT_VECTOR,
                        dim=dim),
        ],
        description="test collection",
    )

    print("\nCreate collection...")
    collection = Collection(name="hello_milvus", schema=schema)

    print("\nList collections...")
    print(list_collections())

    # Insert: one column per schema field, in schema order.
    nb = 3000
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    started = time.time()
    collection.insert([
        list(range(nb)),
        [float(random.randrange(-20, -10)) for _ in range(nb)],
        vectors,
    ])
    elapsed = time.time() - started
    print(f"\nInsert {nb} vectors cost {elapsed} seconds")

    started = time.time()
    print("\nGet collection entities...")
    print(collection.num_entities)
    elapsed = time.time() - started
    print(f"\nGet collection entities cost {elapsed} seconds")

    # Index creation and load.
    index_params = {
        "index_type": "IVF_FLAT",
        "params": {"nlist": 128},
        "metric_type": "L2",
    }
    print("\nCreate index...")
    started = time.time()
    collection.create_index(field_name="float_vector",
                            index_params=index_params)
    elapsed = time.time() - started
    print(f"\nCreate index cost {elapsed} seconds")

    print("\nload collection...")
    started = time.time()
    collection.load()
    elapsed = time.time() - started
    print(f"\nload collection cost {elapsed} seconds")

    # Search with the last two inserted vectors as queries.
    top_k = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    search_started = time.time()
    print("\nSearch...")
    results = collection.search(vectors[-2:],
                                "float_vector",
                                search_params,
                                top_k,
                                "count > 100",
                                output_fields=["count", "random_value"])
    search_latency = time.time() - search_started

    for hits in results:
        for hit in hits:
            # Print each hit together with its "random_value" field value.
            print(hit, hit.entity.get("random_value"))
    print(f"search latency = {search_latency:.4f}s")

    # Query by primary key and print the rows ordered by "count".
    expr = "count in [2,4,6,8]"
    output_fields = ["count", "random_value"]
    rows = collection.query(expr, output_fields)
    for row in sorted(rows, key=lambda item: item['count']):
        print(row)
Пример #20
0
def hello_milvus(collection_name):
    """Run an insert/index/search/query scenario on ``collection_name``.

    When the collection already exists, its own schema and vector dimension
    are used; otherwise a default schema (INT64 primary key, FLOAT, VARCHAR
    and a 128-dimensional float vector) is used. The function then inserts
    3000 rows, builds an IVF_SQ8 index, loads the collection, runs a filtered
    search and a query, and finally releases the collection. Timings are
    printed for each step.
    """
    import time

    dim = 128
    # NOTE(review): released pymilvus versions appear to name the VARCHAR
    # limit ``max_length`` - confirm ``max_length_per_row`` against the
    # installed client.
    schema = CollectionSchema(
        fields=[
            FieldSchema(name="int64", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="float", dtype=DataType.FLOAT),
            FieldSchema(name="varchar",
                        dtype=DataType.VARCHAR,
                        max_length_per_row=65535),
            FieldSchema(name="float_vector",
                        dtype=DataType.FLOAT_VECTOR,
                        dim=dim),
        ],
        description="test collection",
    )

    if utility.has_collection(collection_name):
        print("collection is exist")
        schema = Collection(name=collection_name).schema
        # Take the dimension of the first matching field of the existing
        # schema. NOTE(review): 101 and 102 are presumably numeric DataType
        # codes of vector types - confirm against the DataType enum.
        vector_dims = [
            field.params['dim'] for field in schema.fields
            if field.dtype in (101, 102)
        ]
        dim = vector_dims[0]

    print("\nCreate collection...")
    collection = Collection(name=collection_name, schema=schema)

    # Insert: one column per schema field, in default-schema order.
    nb = 3000
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    started = time.time()
    collection.insert([
        list(range(nb)),
        [np.float32(i) for i in range(nb)],
        [str(i) for i in range(nb)],
        vectors,
    ])
    elapsed = time.time() - started
    print(f"\nInsert {nb} vectors cost {elapsed:.4f} seconds")

    started = time.time()
    print("\nGet collection entities...")
    print(collection.num_entities)
    elapsed = time.time() - started
    print(f"\nGet collection entities cost {elapsed:.4f} seconds")

    # Index creation and load.
    index_params = {
        "index_type": "IVF_SQ8",
        "metric_type": "L2",
        "params": {"nlist": 64},
    }
    print("\nCreate index...")
    started = time.time()
    collection.create_index(field_name="float_vector",
                            index_params=index_params)
    elapsed = time.time() - started
    print(f"\nCreate index cost {elapsed:.4f} seconds")

    print("\nload collection...")
    started = time.time()
    collection.load()
    elapsed = time.time() - started
    print(f"\nload collection cost {elapsed:.4f} seconds")

    # Search with the last two inserted vectors as queries.
    top_k = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    started = time.time()
    print("\nSearch...")
    results = collection.search(vectors[-2:],
                                "float_vector",
                                search_params,
                                top_k,
                                "int64 > 100",
                                output_fields=["int64", "float"],
                                timeout=TIMEOUT)
    elapsed = time.time() - started
    print(f"search cost  {elapsed:.4f} seconds")
    for hits in results:
        for hit in hits:
            # Print each hit together with its "float" field value.
            print(hit, hit.entity.get("float"))

    # Query by primary key and print the rows ordered by "int64".
    expr = "int64 in [2,4,6,8]"
    output_fields = ["int64", "float"]
    rows = collection.query(expr, output_fields, timeout=TIMEOUT)
    for row in sorted(rows, key=lambda item: item['int64']):
        print(row)

    collection.release()
Пример #21
0
from pymilvus import (connections, list_collections, FieldSchema,
                      CollectionSchema, DataType, Collection, Partition,
                      utility)

# Connect with the default connection settings.
print("\nCreate connection...")
connections.connect()

# Show the collections that exist before anything is created.
print("\nList collections...")
print(list_collections())

# Create the 'demo_film_tutorial' collection: an INT64 primary key plus an
# 8-dimensional float vector.
print("\nCreate collection...")
field1 = FieldSchema(
    name="release_year",
    dtype=DataType.INT64,
    description="int64",
    is_primary=True,
)
field2 = FieldSchema(
    name="embedding",
    dtype=DataType.FLOAT_VECTOR,
    description="float vector",
    dim=8,
    is_primary=False,
)
schema = CollectionSchema(
    fields=[field1, field2],
    description="collection description",
)
collection = Collection(name="demo_film_tutorial", data=None, schema=schema)

# Show the collections again, after the create call.
print("\nList collections...")
print(list_collections())

print("\nGet collection name, schema and description...")
Пример #22
0
def hello_milvus():
    """Run an end-to-end scenario on the default connection, then clean up.

    Creates the ``hello_milvus`` collection (INT64 primary key ``count``,
    DOUBLE ``random_value`` and a 128-dimensional float vector), inserts 3000
    rows, builds an IVF_FLAT index, loads the collection, runs a filtered
    search whose latency is printed, and finally drops the collection.
    """
    import time

    connections.connect()

    print("\nList collections...")
    print(list_collections())

    # Schema and collection.
    dim = 128
    schema = CollectionSchema(
        fields=[
            FieldSchema(name="count", dtype=DataType.INT64, is_primary=True),
            FieldSchema(name="random_value", dtype=DataType.DOUBLE),
            FieldSchema(name="float_vector",
                        dtype=DataType.FLOAT_VECTOR,
                        dim=dim),
        ],
        description="test collection",
    )

    print("\nCreate collection...")
    collection = Collection(name="hello_milvus", schema=schema)

    print("\nList collections...")
    print(list_collections())

    # Insert: one column per schema field, in schema order.
    nb = 3000
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    collection.insert([
        list(range(nb)),
        [float(random.randrange(-20, -10)) for _ in range(nb)],
        vectors,
    ])

    print("\nGet collection entities...")
    print(collection.num_entities)

    # Index creation and load.
    index_params = {
        "index_type": "IVF_FLAT",
        "params": {"nlist": 128},
        "metric_type": "L2",
    }
    print("\nCreate index...")
    collection.create_index(field_name="float_vector",
                            index_params=index_params)
    print("\nload collection...")
    collection.load()

    # Search with the last two inserted vectors as queries.
    top_k = 5
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    search_started = time.time()
    print("\nSearch...")
    results = collection.search(vectors[-2:],
                                "float_vector",
                                search_params,
                                top_k,
                                "count > 100",
                                output_fields=["count", "random_value"])
    search_latency = time.time() - search_started

    for hits in results:
        for hit in hits:
            # Print each hit together with its "random_value" field value.
            print(hit, hit.entity.get("random_value"))
    print(f"search latency = {search_latency:.4f}s")

    # Remove the collection created above.
    collection.drop()