def new_application(app_name, fields, s3_bucket):
    ok, message = fields_check(fields)
    if not ok:
        raise ArgsCheckError(message, "")
    try:
        # check whether the application already exists
        if MongoIns.search_by_name(APPLICATION_COLLECTION_NAME, app_name):
            raise ExistError(f"application <{app_name}> already exists", "")
    except ExistError:
        raise
    try:
        # create one milvus collection per pipeline-typed field
        for _, value in fields.items():
            if value.get("type") == "pipeline":
                pipe = MongoIns.search_by_name(PIPELINE_COLLECTION_NAME, value.get("value"))[0]
                ei = identity(pipe.get("encoder").get("instance").get("endpoint"))
                name = f"{app_name}_{pipe.get('encoder').get('instance').get('name').replace('phantoscope_', '')}"
                MilvusIns.new_milvus_collection(name, int(ei["dimension"]), 1024, "l2")
        # create an application entity collection
        MongoIns.new_mongo_collection(f"{app_name}_entity")
        S3Ins.new_s3_buckets(s3_bucket)
        app = Application(name=app_name, fields=fields, bucket=s3_bucket)
        app.metadata = app._metadata()
        MongoIns.insert_documents(APPLICATION_COLLECTION_NAME, app.to_dict())
        return app
    except Exception as e:
        logger.error("error happened during app creation: %s", str(e), exc_info=True)
        raise e
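# Illustrative usage sketch (not part of the original module): a minimal call
# to new_application, with the fields schema inferred from how the function
# reads it above. The pipeline name "example_pipeline", the "caption" field,
# and the bucket "example-bucket" are hypothetical placeholders.
def _example_new_application():
    fields = {
        "image": {"type": "pipeline", "value": "example_pipeline"},
        "caption": {"type": "string", "value": ""},
    }
    return new_application("example_app", fields, "example-bucket")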
def new_pipeline(name, input, index_file_size, processors, encoder, description=None):
    try:
        encoder = operator_detail(encoder)
        pipe = Pipeline(name=name, input=input, output=encoder.output,
                        dimension=encoder.dimension,
                        index_file_size=index_file_size,
                        metric_type=encoder.metric_type,
                        description=description,
                        processors=processors.split(","),
                        encoder=encoder.name)
        if pipeline_ilegal(pipe):
            raise PipelineIlegalError("Pipeline illegal check error", "")
        milvus_collection_name = f"{name}_{encoder.name}"
        MilvusIns.new_milvus_collection(milvus_collection_name,
                                        encoder.dimension,
                                        index_file_size,
                                        encoder.metric_type)
        return pipe.save()
    except Exception as e:
        logger.error(e)
        raise e
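# Illustrative usage sketch: creating a pipeline. The encoder name "vgg16"
# and processor name "face_detector" are hypothetical; note that processors
# is a comma-separated string, since new_pipeline splits it above.
def _example_new_pipeline():
    return new_pipeline(name="example_pipeline", input="image",
                        index_file_size=1024,
                        processors="face_detector",
                        encoder="vgg16",
                        description="example pipeline")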
def delete_milvus_collections_by_fields(app):
    for _, field in app['fields'].items():
        if field["type"] == "pipeline":
            pipe = MongoIns.search_by_name(PIPELINE_COLLECTION_NAME, field.get("value"))[0]
            name = f"{app.get('name')}_{pipe.get('encoder').get('instance').get('name').replace('phantoscope_', '')}"
            MilvusIns.del_milvus_collection(name)
def delete_entity(app_name, entity_id):
    try:
        mongo_ins_name = f"{app_name}_entity"
        entity = MongoIns.search_by_id(mongo_ins_name, entity_id)
        if not entity.count():
            raise NotExistError("Entity %s does not exist" % entity_id, "NotExistError")
        for item in entity:
            en = new_mapping_ins(item)
            for name, fields in en._docs.items():
                # delete the s3 object
                bucket_name = fields.get("url").split("/")[-2]
                object_name = fields.get("url").split("/")[-1]
                S3Ins.del_object(bucket_name, object_name)
                # delete the vectors from milvus
                vids = fields.get("ids")
                app = application_detail(app_name)
                pipe_name = app.fields[name]["value"]
                pipe = pipeline_detail(pipe_name)
                instance_name = pipe.encoder.get("instance")
                MilvusIns.del_vectors(f"{app_name}_{name}_{instance_name}", vids)
        # delete the document from mongodb
        MongoIns.delete_by_id(mongo_ins_name, entity_id)
        logger.info("delete entity %s in application %s", entity_id, app_name)
        return en
    except Exception as e:
        logger.error(e)
        raise e
def delete_milvus_collections_by_fields(app):
    for _, field in app.fields.items():
        if field["type"] == "pipeline":
            pipe = pipeline_detail(field["value"])
            name = pipe.encoder.get("name")
            instance_name = pipe.encoder.get("instance")
            MilvusIns.del_milvus_collection(f"{app.name}_{name}_{instance_name}")
def create_milvus_collections_by_fields(app):
    for field in search_fields(app.fields):
        if field.type == "pipeline":
            pipe = pipeline_detail(field.value)
            name = pipe.encoder.get("name")
            instance_name = pipe.encoder.get("instance")
            encoder = operator_detail(name)
            instance = encoder.inspect_instance(instance_name)
            ei = identity(instance.endpoint)
            MilvusIns.new_milvus_collection(f"{app.name}_{name}_{instance_name}",
                                            int(ei["dimension"]), 1024, "l2")
def search(name, fields=None, topk=10, nprobe=16):
    if fields is None:
        fields = {}
    res = []
    try:
        app = application_detail(name)
        accept_fields = [x for x, y in app.fields.items() if y.get('type') != "object"]
        pipeline_fields = {x: y['pipeline'] for x, y in app.fields.items() if y.get('type') == "object"}
        for k, _ in fields.items():
            if k not in accept_fields and k not in pipeline_fields:
                raise RequestError(f"field {k} not in application", "")
        for n, p in pipeline_fields.items():
            pipe = pipeline_detail(p)
            value = fields.get(n)
            file_data = value.get('data')
            url = value.get('url')
            if not file_data and not url:
                raise RequestError("can't find data or url in request", "")
            vectors = run_pipeline(pipe, data=file_data, url=url)
            if not vectors:
                raise NoneVectorError("can't encode data with encoder, check input or encoder", "")
            milvus_collection_name = f"{pipe.name}_{pipe.encoder}"
            vids = MilvusIns.search_vectors(milvus_collection_name, vectors, topk=topk, nprobe=nprobe)
            # scoring function goes here
            dbs = search_ids_from_mapping([x.id for x in vids[0]])
            for db in dbs:
                m = new_mapping_ins(id=db.id, app_name=db.app_name,
                                    image_url=db.image_url, fields=db.fields)
                res.append(m)
        return res
    except Exception as e:
        raise e
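# Illustrative usage sketch: querying an application whose "image" field is
# backed by a pipeline. The field name and URL are hypothetical; the payload
# shape mirrors what search() reads above (a 'data' or 'url' key per
# pipeline-backed field).
def _example_search():
    query = {"image": {"url": "https://example.com/query.jpg"}}
    return search("example_app", fields=query, topk=5, nprobe=16)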
def upload(name, **kwargs):
    try:
        app = application_detail(name)
        if not app:
            raise NotExistError("application not exist", "application %s not exist" % name)
        bucket_name = app.buckets.split(",")[0]
        accept_fields = [x for x, y in app.fields.items() if y.get('type') != "object"]
        pipeline_fields = {x: y['pipeline'] for x, y in app.fields.items() if y.get('type') == "object"}
        new_fields = app.fields.copy()
        for k, v in kwargs.items():
            if k in accept_fields:
                new_fields[k]['value'] = v
        res = []
        for k, _ in kwargs.get('fields').items():
            if k not in accept_fields and k not in pipeline_fields:
                raise RequestError(f"field {k} not in application", "")
        for n, p in pipeline_fields.items():
            pipe = pipeline_detail(p)
            if not pipe:
                raise NotExistError("pipeline not exist", "pipeline %s not exist" % p)
            value = kwargs['fields'].get(n)
            file_data = value.get('data')
            url = value.get('url')
            if not file_data and not url:
                raise RequestError("can't find data or url in request", "")
            file_name = "{}-{}".format(name, uuid.uuid4().hex)
            file_path = save_tmp_file(file_name, file_data, url)
            # start timing
            start = time.time()
            S3Ins.upload2bucket(bucket_name, file_path, file_name)
            upload_time = time.time()
            logger.debug("[timing] upload image to bucket costs: {:.3f}s".format(upload_time - start))
            vectors = run_pipeline(pipe, data=file_data, url=url)
            pipeline_time = time.time()
            logger.debug("[timing] run pipeline costs: {:.3f}s".format(pipeline_time - upload_time))
            milvus_collection_name = f"{pipe.name}_{pipe.encoder}"
            vids = MilvusIns.insert_vectors(milvus_collection_name, vectors)
            insert_time = time.time()
            logger.debug("[timing] insert to milvus costs: {:.3f}s".format(insert_time - pipeline_time))
            for vid in vids:
                m = DB(id=vid, app_name=name,
                       image_url=gen_url(bucket_name, file_name), fields=new_fields)
                add_mapping_data(m)
                res.append(new_mapping_ins(id=vid, app_name=name,
                                           image_url=gen_url(bucket_name, file_name),
                                           fields=new_fields))
            final_time = time.time()
            logger.debug("[timing] prepare result costs: {:.3f}s".format(final_time - insert_time))
        return res
    except Exception as e:
        logger.error(e)
        raise e
def upload(name, **kwargs):
    try:
        app = application_detail(name)
        if not app:
            raise NotExistError("application not exist", "application %s not exist" % name)
        bucket_name = app.buckets.split(",")[0]
        accept_fields = [x for x, y in app.fields.items() if y.get('type') != "object"]
        pipeline_fields = {x: y['pipeline'] for x, y in app.fields.items() if y.get('type') == "object"}
        new_fields = app.fields.copy()
        for k, v in kwargs.items():
            if k in accept_fields:
                new_fields[k]['value'] = v
        res = []
        for k, _ in kwargs.get('fields').items():
            if k not in accept_fields and k not in pipeline_fields:
                raise RequestError(f"field {k} not in application", "")
        for n, p in pipeline_fields.items():
            pipe = pipeline_detail(p)
            if not pipe:
                raise NotExistError("pipeline not exist", "pipeline %s not exist" % p)
            value = kwargs['fields'].get(n)
            file_data = value.get('data')
            url = value.get('url')
            if not file_data and not url:
                raise RequestError("can't find data or url in request", "")
            file_name = "{}-{}".format(name, uuid.uuid4().hex)
            file_path = save_tmp_file(file_name, file_data, url)
            S3Ins.upload2bucket(bucket_name, file_path, file_name)
            vectors = run_pipeline(pipe, data=file_data, url=url)
            if not vectors:
                raise NoneVectorError("can't encode data with encoder, check input or encoder", "")
            milvus_collection_name = f"{pipe.name}_{pipe.encoder}"
            vids = MilvusIns.insert_vectors(milvus_collection_name, vectors)
            for vid in vids:
                m = DB(id=vid, app_name=name,
                       image_url=gen_url(bucket_name, file_name), fields=new_fields)
                add_mapping_data(m)
                res.append(new_mapping_ins(id=vid, app_name=name,
                                           image_url=gen_url(bucket_name, file_name),
                                           fields=new_fields))
        return res
    except Exception as e:
        logger.error(e)
        raise e
def delete_entity(app_name, entity_name):
    try:
        entity = search_from_mapping(entity_name)
        if not entity:
            raise NotExistError("Entity %s does not exist" % entity_name, "NotExistError")
        MilvusIns.del_vectors(app_name, [int(entity_name)])
        bucket_name = entity.image_url.split("/")[-2]
        object_name = entity.image_url.split("/")[-1]
        S3Ins.del_object(bucket_name, object_name)
        del_mapping(entity_name)
        logger.info("delete entity %s in application %s", entity_name, app_name)
        return new_mapping_ins(id=entity.id, app_name=entity.app_name,
                               image_url=entity.image_url, fields=entity.fields)
    except Exception as e:
        logger.error(e)
        raise e
def upload(name, **kwargs):
    try:
        app = application_detail(name)
        if not app:
            raise NotExistError("application not exist", "application %s not exist" % name)
        bucket_name = app.buckets.split(",")[0]
        accept_fields = [x for x, y in app.fields.items() if y.get('type') != "pipeline"]
        pipeline_fields = {x: y['value'] for x, y in app.fields.items() if y.get('type') == "pipeline"}
        new_fields = app.fields.copy()
        for k, v in kwargs.items():
            if k in accept_fields:
                new_fields[k]['value'] = v
        res = []
        for k, _ in kwargs.get('fields').items():
            if k not in accept_fields and k not in pipeline_fields:
                raise RequestError(f"field {k} not in application", "")
        docs = {}
        valid_field_flag = False
        for n, p in pipeline_fields.items():
            pipe = pipeline_detail(p)
            if not pipe:
                raise NotExistError("pipeline not exist", "pipeline %s not exist" % p)
            value = kwargs['fields'].get(n)
            if not value:
                continue
            valid_field_flag = True
            file_data = value.get('data')
            url = value.get('url')
            if not file_data and not url:
                raise RequestError("can't find data or url in request", "")
            file_name = "{}-{}".format(name, uuid.uuid4().hex)
            file_path = save_tmp_file(file_name, file_data, url)
            S3Ins.upload2bucket(bucket_name, file_path, file_name)
            vectors = run_pipeline(pipe, data=file_data, url=url)
            if not vectors:
                raise NoneVectorError("can't encode data with encoder, check input or encoder", "")
            milvus_collection_name = f"{app.name}_{pipe.encoder['name']}_{pipe.encoder['instance']}"
            vids = MilvusIns.insert_vectors(milvus_collection_name, vectors)
            docs[n] = {"ids": vids, "url": gen_url(bucket_name, file_name)}
        doc_id = MongoIns.insert_documents(f"{app.name}_entity", docs)
        res.append(new_mapping_ins(docs))
        if not valid_field_flag:
            raise RequestError("no valid field exists", "")
        return res
    except Exception as e:
        err_msg = f"Unexpected error happened during upload: {str(e)}"
        logger.error(err_msg, exc_info=True)
        raise UnexpectedError(err_msg, e)
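# Illustrative usage sketch: uploading a single entity. The payload shape
# mirrors what upload() reads from kwargs['fields'] above; the field name
# "image" and the URL are hypothetical.
def _example_upload():
    payload = {"image": {"url": "https://example.com/cat.jpg"}}
    return upload("example_app", fields=payload)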
def search_and_score(milvus_collection_name, mongo_name, field_name,
                     vectors, topk, nprobe, inner_score_mode: str):
    """
    Search vectors in milvus and score them with the inner field score mode.

    :param milvus_collection_name: milvus collection to search
    :param mongo_name: mongo collection to select from
    :param field_name: field name used when searching mongodb
    :param vectors: vectors to search in milvus
    :param topk: milvus topk number
    :param nprobe: milvus nprobe number
    :param inner_score_mode: inner field score mode
    :return: image ids of the matched entities
    """
    result_dbs = []
    MAX_TOPK = 2048
    magic_number = 60
    increase_rate = 0.1
    query_topk = topk + magic_number
    end_flag = False
    try:
        inner_score_mode = InnerFieldScoreMode(inner_score_mode)
    except Exception as e:
        raise WrongInnerFieldModeError("Unsupported inner field mode", e)
    while (len(result_dbs) < topk) and (not end_flag):
        # cap query_topk at the milvus maximum
        query_topk = min(query_topk, MAX_TOPK)
        vids = MilvusIns.search_vectors(milvus_collection_name, vectors,
                                        topk=query_topk, nprobe=nprobe)
        if len(vids) == 0:
            raise NoneVectorError("milvus search result is None", "")
        # a -1 id or a short result means milvus has no more candidates
        if (-1 in vids.id_array[0]) or len(vids[0]) < query_topk:
            end_flag = True
        # apply the inner field score function
        res_vids = get_inner_field_score_result(vids, query_topk, inner_score_mode)
        if len(res_vids) < topk:
            if query_topk < MAX_TOPK:
                # grow query_topk and retry without querying mongodb yet
                query_topk += math.ceil(query_topk * increase_rate)
                increase_rate *= 2
                if not end_flag:
                    continue
            end_flag = True
        result_dbs = MongoIns.search_by_vector_id(mongo_name, field_name, res_vids)
        # grow query_topk in case len(result_dbs) < topk on the next pass
        query_topk += math.ceil(query_topk * increase_rate)
    return result_dbs[:topk]
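# Worked example of the retry growth above (the values follow from the code;
# the scenario is hypothetical): for topk=10 the first milvus query asks for
# query_topk = 10 + 60 = 70 ids. If scoring leaves fewer than 10 entities,
# query_topk grows to 70 + ceil(70 * 0.1) = 77 with increase_rate doubled to
# 0.2, then to 77 + ceil(77 * 0.2) = 93, and so on, capped at MAX_TOPK = 2048.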
def delete_pipeline(name):
    try:
        p = del_pipeline(name)
        if not p:
            raise NotExistError("pipeline %s does not exist" % name, "")
        p = p[0]
        milvus_collection_name = f"{name}_{p.encoder}"
        MilvusIns.del_milvus_collection(milvus_collection_name)
        pipe = Pipeline(name=p.name, input=p.input, output=p.output,
                        dimension=p.dimension, index_file_size=p.index_file_size,
                        metric_type=p.metric_type, description=p.description,
                        processors=p.processors.split(","), encoder=p.encoder)
        return pipe
    except Exception as e:
        logger.error(e)
        raise e
def upload(name, **kwargs):
    try:
        app = application_detail(name)
        if not app:
            raise NotExistError("application not exist", "application %s not exist" % name)
        bucket_name = app.buckets.split(",")[0]
        accept_fields = [x for x, y in app.fields.items() if y.get('type') != "pipeline"]
        pipeline_fields = {x: y['value'] for x, y in app.fields.items() if y.get('type') == "pipeline"}
        new_fields = app.fields.copy()
        for k, v in kwargs.items():
            if k in accept_fields:
                new_fields[k]['value'] = v
        res = []
        for k, _ in kwargs.get('fields').items():
            if k not in accept_fields and k not in pipeline_fields:
                raise RequestError(f"field {k} not in application", "")
        docs = {}
        for n, p in pipeline_fields.items():
            pipe = pipeline_detail(p)
            if not pipe:
                raise NotExistError("pipeline not exist", "pipeline %s not exist" % p)
            value = kwargs['fields'].get(n)
            file_data = value.get('data')
            url = value.get('url')
            if not file_data and not url:
                raise RequestError("can't find data or url in request", "")
            file_name = "{}-{}".format(name, uuid.uuid4().hex)
            file_path = save_tmp_file(file_name, file_data, url)
            S3Ins.upload2bucket(bucket_name, file_path, file_name)
            vectors = run_pipeline(pipe, data=file_data, url=url)
            milvus_collection_name = f"{app.name}_{pipe.encoder['name']}_{pipe.encoder['instance']}"
            vids = MilvusIns.insert_vectors(milvus_collection_name, vectors)
            docs[n] = {"ids": vids, "url": gen_url(bucket_name, file_name)}
        doc_id = MongoIns.insert_documents(f"{app.name}_entity", docs)
        res.append(new_mapping_ins(docs))
        return res
    except Exception as e:
        logger.error(e)
        raise e
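# The tests below assume a unittest-style fixture providing self.name,
# self.dimension, self.index_file_size and self.metric_type. A minimal sketch
# of such a class (the concrete values are assumptions, not from the original;
# the test methods below would be defined on it):
import random
import unittest


class TestMilvusIns(unittest.TestCase):
    def setUp(self):
        self.name = "test_collection"
        self.dimension = 512
        self.index_file_size = 1024
        self.metric_type = "l2"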
def test_new_collection(self):
    """create new collection"""
    rv = MilvusIns.new_milvus_collection(self.name, self.dimension,
                                         self.index_file_size, self.metric_type)
    assert rv is None
def test_insert_vectors(self):
    """test insert vectors"""
    vectors = [[random.random() for _ in range(self.dimension)] for _ in range(20)]
    rv = MilvusIns.insert_vectors(self.name, vectors)
    assert len(rv) == 20
def test_search_vectors(self):
    """test search vectors"""
    q_records = [[random.random() for _ in range(self.dimension)]]
    rv = MilvusIns.search_vectors(self.name, q_records, 10, 16)
    assert rv is not None
def test_del_milvus_collection(self):
    """drop collection"""
    rv = MilvusIns.del_milvus_collection(self.name)
    assert rv is None