import time

import faiss
import numpy as np

vectors = []
vector_ids = []
id_query = {}
with open("embed.txt", "r") as f:
    for line in f:
        line_lst = line.strip().split("\t")
        vectors.append(list(map(float, line_lst[2].split())))
        vector_ids.append(int(line_lst[1]))
        id_query[int(line_lst[1])] = line_lst[0]
vectors = np.array(vectors)
vectors = vectors.astype("float32")
vec_dim = vectors.shape[1]

# Build the index
quantizer = faiss.IndexFlatL2(vec_dim)  # use Euclidean (L2) distance as the metric
nlist = 16384
faiss_index = faiss.IndexIVFFlat(quantizer, vec_dim, nlist, faiss.METRIC_L2)
faiss_index.nprobe = 16
# assert not faiss_index.is_trained
faiss_index.train(vectors)
faiss_index.add(vectors)
faiss.write_index(faiss_index, "large.index")

# Query vectors: take the first 20 stored vectors as queries
query_vectors = vectors[:20]
# Search results: the top-k distances and indices for each query,
# both ndarrays of shape len(query_vectors) x topk
res_distance, res_index = faiss_index.search(query_vectors, 5)
t = time.time()
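# A minimal follow-on sketch (not part of the original snippet): reload the saved
# index and map Faiss's sequential result positions back to the original ids and
# query strings collected above. Assumes vector_ids and id_query are still in scope.
loaded_index = faiss.read_index("large.index")
loaded_index.nprobe = 16
res_distance, res_index = loaded_index.search(query_vectors, 5)
for row, (dists, positions) in enumerate(zip(res_distance, res_index)):
    neighbor_ids = [vector_ids[pos] for pos in positions if pos != -1]
    neighbor_queries = [id_query[i] for i in neighbor_ids]
    print(row, neighbor_ids, neighbor_queries, dists)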
def find_nearest_neighbors(target, emb, k=5, metric="euclidean", gpu_id=None, exact=True):
    """Find the nearest neighbors for each point.

    :param target: vectors for the points for which we find the nearest neighbors
    :type target: numpy.ndarray (num_entities, dim)
    :param emb: vectors for the points from which we find the nearest neighbors
    :type emb: numpy.ndarray (num_entities, dim)
    :param k: number of nearest neighbors, defaults to 5
    :type k: int, optional
    :param metric: distance metric for finding nearest neighbors.
        Available metrics: `metric="euclidean"`, `metric="cosine"`, `metric="dotsim"`
    :type metric: str
    :return: query row indices (nodes), IDs of emb (neighbors), and distances
    :rtype: nodes (numpy.ndarray), neighbors (numpy.ndarray), distances (numpy.ndarray)

    .. highlight:: python
    .. code-block:: python

        >>> import emlens
        >>> import numpy as np
        >>> emb = np.random.randn(100, 20)
        >>> target = np.random.randn(10, 20)
        >>> A = emlens.find_nearest_neighbors(target, emb, k = 10)
    """
    # Faiss requires C-contiguous float32 arrays.
    if not emb.flags["C_CONTIGUOUS"]:
        emb = emb.copy(order="C")
    if not target.flags["C_CONTIGUOUS"]:
        target = target.copy(order="C")
    emb = emb.astype(np.float32)
    target = target.astype(np.float32)

    # Find the nearest neighbors
    if metric == "euclidean":
        if exact:
            index = faiss.IndexFlatL2(emb.shape[1])
        else:
            quantiser = faiss.IndexFlatL2(emb.shape[1])
            nlist = int(np.ceil(10 * np.sqrt(emb.shape[0])))
            index = faiss.IndexIVFFlat(quantiser, emb.shape[1], nlist, faiss.METRIC_L2)
            index.train(emb)
    elif metric == "cosine":
        # Normalize rows so that inner product equals cosine similarity.
        denom = np.array(np.linalg.norm(emb, axis=1)).reshape(-1)
        denom[np.isclose(denom, 0)] = 1
        emb = np.einsum("i,ij->ij", 1 / denom, emb)

        denom = np.array(np.linalg.norm(target, axis=1)).reshape(-1)
        denom[np.isclose(denom, 0)] = 1
        target = np.einsum("i,ij->ij", 1 / denom, target)
        if exact:
            index = faiss.IndexFlatIP(emb.shape[1])
        else:
            quantiser = faiss.IndexFlatIP(emb.shape[1])
            nlist = int(np.ceil(10 * np.sqrt(emb.shape[0])))
            index = faiss.IndexIVFFlat(
                quantiser, emb.shape[1], nlist, faiss.METRIC_INNER_PRODUCT
            )
            index.train(emb)
    elif metric == "dotsim":
        if exact:
            index = faiss.IndexFlatIP(emb.shape[1])
        else:
            quantiser = faiss.IndexFlatIP(emb.shape[1])
            nlist = int(np.ceil(10 * np.sqrt(emb.shape[0])))
            index = faiss.IndexIVFFlat(
                quantiser, emb.shape[1], nlist, faiss.METRIC_INNER_PRODUCT
            )
            index.train(emb)
    else:
        raise NotImplementedError("does not support metric: {}".format(metric))

    if gpu_id is None:
        gpu_id = 0

    if k >= 2048:  # k larger than the GPU limit: stay on the CPU
        index.add(emb)
    else:
        try:
            res = faiss.StandardGpuResources()
            index = faiss.index_cpu_to_gpu(res, gpu_id, index)
            index.add(emb)
        except (RuntimeError, AttributeError):
            index.add(emb)

    distances, neighbors = index.search(target, k=k)
    assert distances.dtype == "float32"
    assert neighbors.dtype == "int64"
    nodes = (np.arange(target.shape[0]).reshape((-1, 1)) @ np.ones((1, k))).astype(int)
    neighbors = neighbors.astype(int)
    return nodes, neighbors, distances
print(index.ntotal)
serarch_i = np.asarray([data[700], data[100]])
k = 4  # we want to see 4 nearest neighbors
start = time.perf_counter()  # time.clock() was removed in Python 3.8
D, I = index.search(serarch_i[:5], k)  # sanity check
end = time.perf_counter()
print(end - start)

# ----------------------------------------------------------
# Speed up the search with an inverted-file index
nlist = 100  # number of cluster centroids
k = 4
quantizer = faiss.IndexFlatL2(d)  # the coarse quantizer
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
# METRIC_L2 is stated explicitly here; it is also the default for IndexIVFFlat
assert not index.is_trained
index.train(data)
assert index.is_trained

start_k_suoyin = time.perf_counter()
index.add(data)  # add may be a bit slower as well
end_k_suoyin = time.perf_counter()
print('the time in add kmeans:')
print(end_k_suoyin - start_k_suoyin)

serarch_a = np.asarray([data[700], data[200]])
# print(serarch_a)
start = time.perf_counter()
def __init__(self, load_path: str, word_to_idlist_filename: str, entities_list_filename: str, entities_ranking_filename: str, vectorizer_filename: str, faiss_index_filename: str, chunker: NerChunker = None, ner: Chainer = None, ner_parser: EntityDetectionParser = None, entity_ranker: RelRankerBertInfer = None, num_faiss_candidate_entities: int = 20, num_entities_for_bert_ranking: int = 50, num_faiss_cells: int = 50, use_gpu: bool = True, save_path: str = None, fit_vectorizer: bool = False, max_tfidf_features: int = 1000, include_mention: bool = False, ngram_range: List[int] = None, num_entities_to_return: int = 10, lang: str = "ru", use_descriptions: bool = True, lemmatize: bool = False, **kwargs) -> None: """ Args: load_path: path to folder with inverted index files word_to_idlist_filename: file with dict of words (keys) and start and end indices in entities_list filename of the corresponding entity ids entities_list_filename: file with the list of entity ids from the knowledge base entities_ranking_filename: file with dict of entity ids (keys) and number of relations in Wikidata for entities vectorizer_filename: filename with TfidfVectorizer data faiss_index_filename: file with Faiss index of words chunker: component deeppavlov.models.kbqa.ner_chunker ner: config for entity detection ner_parser: component deeppavlov.models.kbqa.entity_detection_parser entity_ranker: component deeppavlov.models.kbqa.rel_ranking_bert_infer num_faiss_candidate_entities: number of nearest neighbors for the entity substring from the text num_entities_for_bert_ranking: number of candidate entities for BERT ranking using description and context num_faiss_cells: number of Voronoi cells for Faiss index use_gpu: whether to use GPU for faster search of candidate entities save_path: path to folder with inverted index files fit_vectorizer: whether to build index with Faiss library max_tfidf_features: maximal number of features for TfidfVectorizer include_mention: whether to leave entity mention in the context (during BERT ranking) ngram_range: char ngrams range for TfidfVectorizer num_entities_to_return: number of candidate entities for the substring which are returned lang: russian or english use_description: whether to perform entity ranking by context and description lemmatize: whether to lemmatize tokens **kwargs: """ super().__init__(save_path=save_path, load_path=load_path) self.morph = pymorphy2.MorphAnalyzer() self.lemmatize = lemmatize self.word_to_idlist_filename = word_to_idlist_filename self.entities_list_filename = entities_list_filename self.entities_ranking_filename = entities_ranking_filename self.vectorizer_filename = vectorizer_filename self.faiss_index_filename = faiss_index_filename self.num_entities_for_bert_ranking = num_entities_for_bert_ranking self.num_faiss_candidate_entities = num_faiss_candidate_entities self.num_faiss_cells = num_faiss_cells self.use_gpu = use_gpu self.chunker = chunker self.ner = ner self.ner_parser = ner_parser self.entity_ranker = entity_ranker self.fit_vectorizer = fit_vectorizer self.max_tfidf_features = max_tfidf_features self.include_mention = include_mention self.ngram_range = ngram_range self.num_entities_to_return = num_entities_to_return self.lang_str = f"@{lang}" if self.lang_str == "@en": self.stopwords = set(stopwords.words("english")) elif self.lang_str == "@ru": self.stopwords = set(stopwords.words("russian")) self.use_descriptions = use_descriptions self.load() if self.fit_vectorizer: self.vectorizer = TfidfVectorizer( analyzer="char_wb", 
ngram_range=tuple(self.ngram_range), max_features=self.max_tfidf_features, max_df=0.85) self.vectorizer.fit(self.word_list) self.matrix = self.vectorizer.transform(self.word_list) self.dense_matrix = self.matrix.toarray() if self.num_faiss_cells > 1: quantizer = faiss.IndexFlatIP(self.max_tfidf_features) self.faiss_index = faiss.IndexIVFFlat(quantizer, self.max_tfidf_features, self.num_faiss_cells) self.faiss_index.train(self.dense_matrix.astype(np.float32)) else: self.faiss_index = faiss.IndexFlatIP(self.max_tfidf_features) if self.use_gpu: res = faiss.StandardGpuResources() self.faiss_index = faiss.index_cpu_to_gpu( res, 0, self.faiss_index) self.faiss_index.add(self.dense_matrix.astype(np.float32)) self.save_vectorizers_data()
result = {}
result["dim"] = d
result["nb"] = nb
result["k"] = k
print(result)

xb = np.random.random((nb, d)).astype('float32')
xb[:, 0] += np.arange(nb) / 1000.
xq = np.random.random((nq, d)).astype('float32')
xq[:, 0] += np.arange(nq) / 1000.
normalize_L2(xb)
normalize_L2(xq)

nlist = nb // 10000  # number of IVF cells; must be an integer
quantizer = faiss.IndexFlatIP(d)  # coarse quantizer
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
index.nprobe = nlist // 4
index.verbose = True
assert not index.is_trained
index.train(xb)
assert index.is_trained
index.add(xb)  # add may be a bit slower as well

spent = []
for i in range(100):
    start = datetime.datetime.now()
    D, I = index.search(xq[:1000], k)  # actual search
    end = datetime.datetime.now()
    s = end - start
    spent.append(s.total_seconds())
result["IVF_avg_spent"] = sum(spent) / 100
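# A hedged addition (not in the original benchmark): alongside the timing above,
# it can help to measure how much recall the IVF approximation gives up relative
# to an exact inner-product search over the same data.
exact_index = faiss.IndexFlatIP(d)
exact_index.add(xb)
_, gt_I = exact_index.search(xq[:1000], k)   # exact ground-truth neighbors
_, ivf_I = index.search(xq[:1000], k)        # approximate IVF neighbors
recall_at_k = np.mean([
    len(set(ivf_row) & set(gt_row)) / float(k)
    for ivf_row, gt_row in zip(ivf_I, gt_I)
])
result["IVF_recall_at_k"] = recall_at_k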
def predict_topk(biosyn, eval_dictionary, eval_queries, topk, score_mode='hybrid', type_given=False): """ Parameters ---------- score_mode : str hybrid, dense, sparse """ encoder = biosyn.get_dense_encoder() tokenizer = biosyn.get_dense_tokenizer() sparse_encoder = biosyn.get_sparse_encoder() sparse_weight = biosyn.get_sparse_weight().item() # must be scalar value # useful if we're conditioning on types all_indv_types = [x for t in eval_dictionary[:, 1] for x in t.split('|')] unique_types = np.unique(all_indv_types).tolist() v_check_type = np.vectorize(check_label) inv_idx = { t: v_check_type(eval_dictionary[:, 1], t).nonzero()[0] for t in unique_types } # embed dictionary dict_sparse_embeds = biosyn.embed_sparse(names=eval_dictionary[:, 0], show_progress=True) dict_dense_embeds = biosyn.embed_dense(names=eval_dictionary[:, 0], show_progress=True) # build the sparse index if not type_given: sparse_index = nmslib.init(method='hnsw', space='negdotprod_sparse_fast', data_type=nmslib.DataType.SPARSE_VECTOR) sparse_index.addDataPointBatch(dict_sparse_embeds) sparse_index.createIndex({'post': 2}, print_progress=False) else: sparse_index = {} for sty, indices in inv_idx.items(): sparse_index[sty] = nmslib.init( method='hnsw', space='negdotprod_sparse_fast', data_type=nmslib.DataType.SPARSE_VECTOR) sparse_index[sty].addDataPointBatch(dict_sparse_embeds[indices]) sparse_index[sty].createIndex({'post': 2}, print_progress=False) # build the dense index d = dict_dense_embeds.shape[1] if not type_given: nembeds = dict_dense_embeds.shape[0] if nembeds < 10000: # if the number of embeddings is small, don't approximate dense_index = faiss.IndexFlatIP(d) dense_index.add(dict_dense_embeds) else: nlist = int(math.floor( math.sqrt(nembeds))) # number of quantized cells nprobe = int(math.floor( math.sqrt(nlist))) # number of the quantized cells to probe quantizer = faiss.IndexFlatIP(d) dense_index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT) dense_index.train(dict_dense_embeds) dense_index.add(dict_dense_embeds) dense_index.nprobe = nprobe else: dense_index = {} for sty, indices in inv_idx.items(): sty_dict_dense_embeds = dict_dense_embeds[indices] nembeds = sty_dict_dense_embeds.shape[0] if nembeds < 10000: # if the number of embeddings is small, don't approximate dense_index[sty] = faiss.IndexFlatIP(d) dense_index[sty].add(sty_dict_dense_embeds) else: nlist = int(math.floor( math.sqrt(nembeds))) # number of quantized cells nprobe = int(math.floor(math.sqrt( nlist))) # number of the quantized cells to probe quantizer = faiss.IndexFlatIP(d) dense_index[sty] = faiss.IndexIVFFlat( quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT) dense_index[sty].train(sty_dict_dense_embeds) dense_index[sty].add(sty_dict_dense_embeds) dense_index[sty].nprobe = nprobe # respond to mention queries queries = [] for eval_query in tqdm(eval_queries, total=len(eval_queries)): mentions = eval_query[0].replace("+", "|").split("|") golden_cui = eval_query[1].replace("+", "|") golden_sty = eval_query[2].replace("+", "|") pmid = eval_query[3] start_char = eval_query[4] end_char = eval_query[5] dict_mentions = [] for mention in mentions: mention_sparse_embeds = biosyn.embed_sparse( names=np.array([mention])) mention_dense_embeds = biosyn.embed_dense( names=np.array([mention])) # search the sparse index if not type_given: sparse_nn = sparse_index.knnQueryBatch(mention_sparse_embeds, k=topk, num_threads=20) else: sparse_nn = sparse_index[golden_sty].knnQueryBatch( mention_sparse_embeds, k=topk, num_threads=20) 
sparse_idxs, _ = zip(*sparse_nn) s_candidate_idxs = np.asarray(sparse_idxs) if type_given: # reverse mask index mapping s_candidate_idxs = inv_idx[golden_sty][s_candidate_idxs] s_candidate_idxs = s_candidate_idxs.astype(np.int64) # search the dense index if not type_given: _, d_candidate_idxs = dense_index.search( mention_dense_embeds, topk) else: _, d_candidate_idxs = dense_index[golden_sty].search( mention_dense_embeds, topk) # reverse mask index mapping d_candidate_idxs = inv_idx[golden_sty][d_candidate_idxs] d_candidate_idxs = d_candidate_idxs.astype(np.int64) # get the reduced candidate set reduced_candidate_idxs = np.unique( np.hstack([ s_candidate_idxs.reshape(-1, ), d_candidate_idxs.reshape(-1, ) ])) # get score matrix sparse_score_matrix = biosyn.get_score_matrix( query_embeds=mention_sparse_embeds, dict_embeds=dict_sparse_embeds[ reduced_candidate_idxs, :]).todense() dense_score_matrix = biosyn.get_score_matrix( query_embeds=mention_dense_embeds, dict_embeds=dict_dense_embeds[reduced_candidate_idxs, :]) if score_mode == 'hybrid': score_matrix = sparse_weight * sparse_score_matrix + dense_score_matrix elif score_mode == 'dense': score_matrix = dense_score_matrix elif score_mode == 'sparse': score_matrix = sparse_score_matrix else: raise NotImplementedError() # take care of getting the best indices candidate_idxs = biosyn.retrieve_candidate( score_matrix=score_matrix, topk=topk) candidate_idxs = reduced_candidate_idxs[candidate_idxs] np_candidates = eval_dictionary[candidate_idxs].squeeze() dict_candidates = [] for np_candidate in np_candidates: dict_candidates.append({ 'name': np_candidate[0], 'sty': np_candidate[1], 'cui': np_candidate[2], 'label': check_label(np_candidate[2], golden_cui) }) dict_mentions.append({ 'mention': mention, 'golden_cui': golden_cui, # golden_cui can be composite cui 'pmid': pmid, 'start_char': start_char, 'end_char': end_char, 'candidates': dict_candidates }) queries.append({'mentions': dict_mentions}) result = {'queries': queries} return result
df.drop_duplicates(inplace=True, subset=["description"]) top_k_hits = 3 df.dropna(inplace=True, subset=["description"]) model = SentenceTransformer( 'bert-base-nli-stsb-mean-tokens') # BERT model fine tuned on STS dataset embedding_size = 768 # Size of embeddings of each book description top_k = 3 # Number of similarity matchings to output embedding_cache_path = "data.pkl" num_clusters = 200 # Define FAISS quantizer = faiss.IndexFlatIP(embedding_size) index = faiss.IndexIVFFlat(quantizer, embedding_size, num_clusters, faiss.METRIC_INNER_PRODUCT) index.nprobe = 3 if not os.path.exists(embedding_cache_path): descriptions = [] titles = [] isbn13 = [] isbn = [] for row in df.itertuples(): descriptions.append(row.description) titles.append(row.title) isbn13.append(row.isbn13) isbn.append(row.isbn)
def IVFFlatGpu(config): print("IVFFlatGpu, ", config) d = config['dimension'] # dimension nb = config['db_size'] # database size nq = config['query_num'] # nb of queries topk = config['top_k'] nlist = config['nlist'] nprobe = config['nprobe'] search_repeat = 10 res = faiss.StandardGpuResources() # use a single GPU # temp memory if config["temp_memory"] == 0: res.noTempMemory() elif config["temp_memory"] != -1: res.setTempMemory(config["temp_memory"] * 1024 * 1024) index_list = [] create_ave_duration = 0 search_ave_duration = 0 if config['test_batch_write'] == True: batch_write_ave_duration = 0 batch_write_num = config['write_batch_num'] batch_write_time = int(nb / config['write_batch_num']) print("batch_write_time = ", batch_write_num) for i in range(config['db_num']): # Using an IVF index quantizer = faiss.IndexFlatL2(d) # the other index index_ivf = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2) gpu_index_ivf = faiss.index_cpu_to_gpu(res, 0, index_ivf) batch_write_ave_one_lib = 0 for j in range(batch_write_time): np.random.seed(i * batch_write_time + j) xb = np.random.random((batch_write_num, d)).astype('float32') xb[:, 0] += np.arange(batch_write_num) / 1000. begin_time = time.time() if gpu_index_ivf.is_trained == False: print("train, j=", j) gpu_index_ivf.train(xb) gpu_index_ivf.add(xb) duration = time.time() - begin_time batch_write_ave_one_lib += duration batch_write_ave_duration += duration print(i, ",batch_write_ave_one_lib = ", (batch_write_ave_one_lib / batch_write_time) * 1000 * 1000, " us") index_list.append(index_ivf) print("batch_write_ave_duration = ", (batch_write_ave_duration / len(index_list) / batch_write_time) * 1000 * 1000, " us") return index_list for i in range(config['db_num']): np.random.seed(i) # make reproducible xb = np.random.random((nb, d)).astype('float32') xb[:, 0] += np.arange(nb) / 1000. begin_time = time.time() quantizer = faiss.IndexFlatL2(d) # the other index index_ivf = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2) # here we specify METRIC_L2, by default it performs inner-product search # make it an IVF GPU index gpu_index_ivf = faiss.index_cpu_to_gpu(res, 0, index_ivf) assert not gpu_index_ivf.is_trained gpu_index_ivf.train(xb) # add vectors to the index assert gpu_index_ivf.is_trained gpu_index_ivf.add(xb) # add vectors to the index gpu_index_ivf.nprobe = nprobe duration = time.time() - begin_time create_ave_duration += duration index_list.append(gpu_index_ivf) if i == 0: gpu_index_ivf.search(xb[:5], 4) print("craete ave duration = ", create_ave_duration / len(index_list), " s") if len(index_list) == 0: return index_list for i in range(len(index_list)): for j in range(search_repeat): np.random.seed(i * search_repeat + j + config['db_num']) xq = np.random.random((nq, d)).astype('float32') xq[:, 0] += np.arange(nq) / 1000. begin_time = time.time() index_list[i].search(xq, topk) # actual search duration = time.time() - begin_time search_ave_duration += duration print("search index aver time = ", search_ave_duration / len(index_list) / search_repeat, " s") return index_list
def build_index(self, sentences_or_file_path: Union[str, List[str]], use_faiss: bool = None, faiss_fast: bool = False, device: str = None, batch_size: int = 64): if use_faiss is None or use_faiss: try: import faiss assert hasattr(faiss, "IndexFlatIP") use_faiss = True except: logger.warning( "Fail to import faiss. If you want to use faiss, install faiss through PyPI. Now the program continues with brute force search." ) use_faiss = False # if the input sentence is a string, we assume it's the path of file that stores various sentences if isinstance(sentences_or_file_path, str): sentences = [] with open(sentences_or_file_path, "r") as f: logging.info("Loading sentences from %s ..." % (sentences_or_file_path)) for line in tqdm(f): sentences.append(line.rstrip()) sentences_or_file_path = sentences logger.info("Encoding embeddings for sentences...") embeddings = self.encode(sentences_or_file_path, device=device, batch_size=batch_size, normalize_to_unit=True, return_numpy=True) logger.info("Building index...") self.index = {"sentences": sentences_or_file_path} if use_faiss: quantizer = faiss.IndexFlatIP(embeddings.shape[1]) if faiss_fast: index = faiss.IndexIVFFlat( quantizer, embeddings.shape[1], min(self.num_cells, len(sentences_or_file_path))) else: index = quantizer if (self.device == "cuda" and device != "cpu") or device == "cuda": if hasattr(faiss, "StandardGpuResources"): logger.info("Use GPU-version faiss") res = faiss.StandardGpuResources() res.setTempMemory(20 * 1024 * 1024 * 1024) index = faiss.index_cpu_to_gpu(res, 0, index) else: logger.info("Use CPU-version faiss") else: logger.info("Use CPU-version faiss") if faiss_fast: index.train(embeddings.astype(np.float32)) index.add(embeddings.astype(np.float32)) index.nprobe = min(self.num_cells_in_search, len(sentences_or_file_path)) self.is_faiss_index = True else: index = embeddings self.is_faiss_index = False self.index["index"] = index logger.info("Finished")
def train_faiss(item_vector):
    quantizer = faiss.IndexFlatL2(item_vector.shape[1])
    index = faiss.IndexIVFFlat(quantizer, item_vector.shape[1], 80)
    index.train(item_vector)
    index.add(item_vector)
    return index
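# A small usage sketch for the helper above (the data below is made up for
# illustration; item_vector is assumed to be an (n, d) float32 numpy array).
import numpy as np
import faiss

item_vector = np.random.random((10000, 64)).astype('float32')
index = train_faiss(item_vector)
index.nprobe = 8  # scan 8 of the 80 cells; higher nprobe trades speed for recall
distances, neighbors = index.search(item_vector[:5], 10)  # top-10 per query
print(neighbors)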
"hdfs://localhost:9000/database_embeddings/*", recursiveFileLookup=True).select("features").toJSON().collect() # writing databases to database_vector.pkl list_all_db_vectors = [] list_all_db_vectors_index = [] i = 0 for data in databases: data = json.loads(data)["features"] data = get_features(data, "database") data_index = [i] * len(data) list_all_db_vectors += data list_all_db_vectors_index.append(data_index) list_all_db_vectors = np.asarray(list_all_db_vectors).astype('float32') list_all_db_vectors = normalize(list_all_db_vectors, axis=1, norm='l2') list_all_db_vectors_index = flatten(list_all_db_vectors_index) with open("database_vector_index.pkl", "wb") as f: pickle.dump(list_all_db_vectors_index, f) # initializing database vectors # with open("database_vector.pkl",'rb') as f : # database_vector = pickle.load(f) # initialization for FAISS algorithm cluster = NUMBER_OF_CLUSTER dimension = list_all_db_vectors[0].shape[0] quantiser = faiss.IndexFlatIP(dimension) index = faiss.IndexIVFFlat(quantiser, dimension, cluster, faiss.METRIC_INNER_PRODUCT) # training index on database vectors index.train(list_all_db_vectors) index.add(list_all_db_vectors) faiss.write_index(index, "database_faiss.index")
def search(q, query_type): videos = findVideos(q) timings = {} for each_video in videos: df_embeddings = pd.read_csv("data/" + str(each_video) + '/person_embeddings_mapping.csv', sep='\t') cols = list(df_embeddings) cols.insert(0, cols.pop(cols.index('Embeddings'))) df_embeddings = df_embeddings.ix[:, cols] x = str(df_embeddings['Embeddings'].tolist()).replace("\'", "") x = ast.literal_eval(x) y = numpy.array(x) y = y.astype('float32') d = 128 nlist = 1 k = 1 quantizer = faiss.IndexFlatL2(d) # the other index index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2) index.train(y) # t and y ma k farak cha? index.add(y) # add may be a bit slower as well D, I = index.search(q, k) # actual search pos = [0] * len(I) p = [0] * len(I) # if face is not present: then add to the list if I == [[]]: print("Not found") else: for i in range(len(I)): pos[i] = I[i][0] p[i] = df_embeddings.iloc[pos[i], 1] df_person_bitmap = pd.read_csv("data/" + str(each_video) + '/person_bitmap_vector.csv', sep='\t') person_bitmap = [0] * len(p) for i in range(len(p)): person_bitmap[i] = df_person_bitmap.loc[ df_person_bitmap['person_label'] == int( p[i])]['BitMap'].values[0] person_bitmap[i] = str(person_bitmap[i]).replace("\'", "") person_bitmap[i] = json.loads(person_bitmap[i]) person_bitmap[i] = numpy.array(person_bitmap[i]) if query_type == 'next': timings[each_video] = next(person_bitmap[0], person_bitmap[1]) if query_type == 'eventually': timings[each_video] = eventually(person_bitmap[0], person_bitmap[1]) if query_type == 'is_before': timings[each_video] = is_a_before_b(person_bitmap[0], person_bitmap[1]) if query_type == 'interval': timings[each_video] = interval(person_bitmap) return timings
def __init__(self) -> None:
    quantizer = faiss.IndexFlatL2(self.d)
    self.index = faiss.IndexIVFFlat(quantizer, self.d, self.nlist, faiss.METRIC_L2)
    self.index.nprobe = self.nprobe
def create_index(dim: int = 512, cells: int = 100):
    return faiss.IndexIVFFlat(faiss.IndexFlatL2(dim), dim, cells)
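# A hedged usage sketch for create_index (the vectors and sizes below are made up
# for illustration): an IVF index must be trained on representative data before
# vectors can be added or searched.
import numpy as np
import faiss

xb = np.random.random((50000, 512)).astype('float32')
index = create_index(dim=512, cells=100)
index.train(xb)     # learn the 100 cell centroids
index.add(xb)
index.nprobe = 10   # number of cells scanned per query
D, I = index.search(xb[:3], 5)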
def main(sys): np.seterr(over='ignore') m = len(sys) print ("The script has the name %s" % (sys[0])) print("initiate: %s " % (sys[0])) print ("Number of arguments: ", m, " arguments.") input_file_dataset = sys[1] input_file_queries = sys[2] k = int(sys[3]) var = sys[4] run = sys[5] ground_truth_D = sys[6] ground_truth_I = sys[7] error = float(sys[8]) nlist = int(sys[9]) #number of clusters nprobe = int(sys[10]) #how many times repeat search print("check of the arguments") for i in range(m): print("arguments: %s " % (sys[i])) dataset = os.path.realpath(input_file_dataset) queryset = os.path.realpath(input_file_queries) groundtruth_D = os.path.realpath(ground_truth_D) groundtruth_I = os.path.realpath(ground_truth_I) #ground_truth = os.path.realpath(output_file_gt) a_vectors = np.loadtxt(dataset).astype(np.float32) query_set = np.loadtxt(queryset).astype(np.float32) GT_D = np.loadtxt(groundtruth_D).astype(np.float32) GT_I = np.loadtxt(groundtruth_I).astype(np.float32) n_db = len(a_vectors) d = len(a_vectors[0]) #dimension of database n_q = len(query_set) # nlist = int(len(a_vectors) / k) #number of clusters # nprobe = int((k/2)+1) #how many times repeat search print("check of dimensions") print("param n_db", n_db) print("param d", d) print("param k", k) print("param n_q", n_q) print("param nlist", nlist) print("param nprobe", nprobe) print("param error", error) print("faiss ...") start1 = time.clock() quantizer = faiss.IndexFlatL2(d) # build the index index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2) assert not index.is_trained index.train(a_vectors) assert index.is_trained index.add( a_vectors) stop1 = time.clock() start2 = time.clock() index.nprobe = nprobe # if set to nlist = same as brute force, part of autotuning performance - speed accuracy tradeoff D, I = index.search(query_set, k) # actual search stop2 = time.clock() #run recall recall_i = recall_similar_match( GT_I, I) recall_d = recall_with_error( GT_D, D, error) stringname_D = 'D' + sys[0][1:7] + '_' + var +'.txt' stringname_I = 'I' + sys[0][1:7] + '_' + var +'.txt' np.savetxt(stringname_D, D) np.savetxt(stringname_I, I) time1 = stop1 - start1 time2 = stop2 - start2 #run, filename, index_time, build_time, recall_D, recall_I, n_db, n_q, d, k print_time(run, sys[0], time1, time2, recall_d, recall_i, n_db, n_q, d, k, error) print("finish")
def create_faiss(self): quantiser = faiss.IndexFlatL2(self.num_dimensions) self.faiss_params = {} if self.similarity_metric == 'cosine': self.faiss_params['preprocess_opt'] = 'norm' self.faiss_params['metric'] = faiss.METRIC_L2 elif self.similarity_metric == 'inner': self.faiss_params['preprocess_opt'] = 'false' self.faiss_params['metric'] = faiss.METRIC_INNER_PRODUCT elif self.similarity_metric == 'euclidean': self.faiss_params['preprocess_opt'] = 'false' self.faiss_params['metric'] = faiss.METRIC_L2 elif self.similarity_metric == 'mahalanobis': self.faiss_params['preprocess_opt'] = 'covar' self.faiss_params['metric'] = faiss.METRIC_L2 self.faiss_indices = { 'title': faiss.IndexIVFFlat(quantiser, self.num_dimensions, self.num_centroids, self.faiss_params['metric']), 'abstract': faiss.IndexIVFFlat(quantiser, self.num_dimensions, self.num_centroids, self.faiss_params['metric']), 'body': faiss.IndexIVFFlat(quantiser, self.num_dimensions, self.num_centroids, self.faiss_params['metric']), } # Title train self.lookup_titles = {} documents_with_titles = [] self.lookup_abstracts = {} documents_with_abstracts = [] self.lookup_bodies = {} documents_with_bodies = [] for doc in self.documents: v_title = doc.mean_vector_title() v_abstract = doc.mean_vector('abstract') v_body = doc.mean_vector('body') if not (v_title is None or len(v_title.shape) != 1 and v_title.shape[0] != self.num_dimensions): self.lookup_titles[len(documents_with_titles)] = doc documents_with_titles.append(v_title) if not (v_abstract is None or len(v_abstract.shape) != 1 and v_abstract.shape[0] != self.num_dimensions): self.lookup_abstracts[len(documents_with_abstracts)] = doc documents_with_abstracts.append(v_abstract) if not (v_body is None or len(v_body.shape) != 1 and v_body.shape[0] != self.num_dimensions): self.lookup_bodies[len(documents_with_bodies)] = doc documents_with_bodies.append(v_body) vectors = self.search_preprocess(np.stack(documents_with_titles, axis=0), is_train=True) self.faiss_indices['title'].train(vectors) self.faiss_indices['title'].add(vectors) # Train abstract vectors = self.search_preprocess(np.stack(documents_with_abstracts, axis=0), is_train=True) self.faiss_indices['abstract'].train(vectors) self.faiss_indices['abstract'].add(vectors) # Train full vectors = self.search_preprocess(np.stack(documents_with_bodies, axis=0), is_train=True) self.faiss_indices['body'].train(vectors) self.faiss_indices['body'].add(vectors)
# define msg queue names msg_queues = {} msg_queues['crawler_notify'] = "CRAWLER_NOTIFY_QUE" msg_queues['feature_detect'] = "FEATURE_DETECT_QUE" msg_queues['detect_finish'] = "FEATURE_DETECT_FINISH_QUE" # feature detect related config static_image_feature_dir = "/Users/tonyyoung/test/feature/inception/image" animated_image_feature_dir = "/Users/tonyyoung/test/feature/600" static_dimension = 2048 animaed_dimension = 600 nlist = 20 # Number of clustering centers quantizer_static = faiss.IndexFlatL2(static_dimension) quantizer_animaed = faiss.IndexFlatL2(animaed_dimension) static_image_index = faiss.IndexIVFFlat(quantizer_static, static_dimension, nlist, faiss.METRIC_L2) animated_image_index = faiss.IndexIVFFlat(quantizer_animaed, animaed_dimension, nlist, faiss.METRIC_L2) '''static_image_index = faiss.IndexFlatL2(static_dimension) animated_image_index = faiss.IndexFlatL2(animaed_dimension)''' duplicate_threshold = 10 # OSS account info. clientId = 'LTAIW5NjZnlwWIjr' clientSecret = 'BWajgSlWW32EtuQbTmDywvSf7pvwuj' MEDIA_ROOT = "/home/ubuntu/workspace/smile_sv/smile/media" # ========================= Redis operations ======================== '''sticker_md5 = "sticker_md5"
def __init__(self, dim=VECTOR_DIMENSION):
    # if isfile(save_path):
    #     self._index = faiss.read_index(save_path)
    # else:
    quantizer = faiss.IndexFlatL2(dim)
    self._index = faiss.IndexIVFFlat(quantizer, dim, N_LIST, faiss.METRIC_L2)
def ivf_search():
    '''
    IndexIVFFlat inverted-file search, built on top of the flat (direct) search.
    Requirement: the number of rows must be >= nlist (the number of partitions).
    Pros: much less computation, a clear speedup (about 0.5 s on 5 million vectors / 8 GB).
    Cons: a little accuracy is lost, and building the inverted lists takes time.
    IndexIVFFlat splits the dataset into several parts: Voronoi cells are defined in
    d-dimensional space and each database vector falls into one of the cells. At search
    time, only the database vectors y in the cell containing the query x, plus a few
    neighbouring cells, are compared with the query vector.
    :return:
    '''
    nlist = 4  # number of cells the data is partitioned into; more cells makes partitioning slower
    index = faiss.IndexFlatL2(d)
    iv_index = faiss.IndexIVFFlat(index, d, nlist, faiss.METRIC_L2)
    if not iv_index.is_trained:
        iv_index.train(xb)  # train on xb
    iv_index.add(xb)  # add xb

    # add new data on the fly
    xb1 = np.random.random((1, d)).astype('float32')
    xb1[:, 0] += np.arange(1) / 1000.
    xb1[0][0] -= 1
    print(xb1)
    iv_index.train(xb1)  # effectively a no-op: the index is already trained
    iv_index.add(xb1)

    xb2 = np.ones((1, d)).astype('float32')
    iv_index.train(xb2)
    iv_index.add(xb2)

    iv_index.nprobe = 3  # number of cells visited per query; larger is more accurate but slower
    # (here the data is split into 4 cells and 3 of them are scanned per query)
    start = time.time()
    print(type(xq))
    D, I = iv_index.search(xq, K)
    print('ivf_search_time', time.time() - start)
    print('ivf_search_indices', I)
    print('ivf_search_distances', D)

    xq3 = np.ones((1, d)).astype('float32')
    xq3[:, 0] += np.arange(1) / 1000.
    D, I = iv_index.search(xq3, K)
    print('ivf_search_indices', I)
    print('ivf_search_distances', D)

    vectors = MyRecognition.face_descriptors(
        img_path='/home/stringk/PycharmProjects/pytorchDemo/demos/face_recognition_demo/img/0/0image0.jpg'
    )
    for v in vectors:
        nv = np.array([v], dtype='float32')
        print(nv)
        print(type(nv))
        print(nv[0][-1])
        iv_index.train(nv)
        iv_index.add(nv)
        D, I = iv_index.search(nv, K)
        print('ivf_search_indices', I)
        print('ivf_search_distances', D)
def build_index(cfg: DictConfig, model: object): """ Builds faiss index from index dataset specified in the config. Args: cfg (DictConfig): Config file specifying index parameters model (object): Encoder model """ # Get index dataset embeddings # PCA model exists and index embeddings have already been PCAed, no need to re-extract/PCA them if cfg.apply_pca and os.path.isfile( cfg.pca.pca_save_name) and os.path.isfile( cfg.pca_embeddings_save_name): logging.info("Loading reduced dimensionality embeddings") embeddings = h5py.File(cfg.pca_embeddings_save_name, "r") embeddings = embeddings[cfg.index_ds.name][:] elif os.path.isfile(cfg.embedding_save_name): logging.info("Loading previously extracted index dataset embeddings") embeddings = h5py.File(cfg.embedding_save_name, "r") embeddings = embeddings[cfg.index_ds.name][:] else: logging.info("Encoding index dataset, this may take a while") index_dataloader = model.setup_dataloader(cfg.index_ds, is_index_data=True) embeddings, concept_ids = get_index_embeddings(cfg, index_dataloader, model) # Create pca model to reduce dimensionality of index dataset and decrease memory footprint if cfg.apply_pca: # Need to train PCA model and apply PCA transformation with newly trained model if not os.path.isfile(cfg.pca.pca_save_name): logging.info( "Fitting PCA model for embedding dimensionality reduction") pca_train_set = random.sample( list(embeddings), k=int(len(embeddings) * cfg.pca.sample_fraction)) pca = PCA(n_components=cfg.pca.output_dim) pca.fit(pca_train_set) pkl.dump(pca, open(cfg.pca.pca_save_name, "wb")) embeddings = reduce_embedding_dim(pca, embeddings, cfg) # PCA model already trained, just need to reduce dimensionality of all embeddings elif not os.path.isfile(cfg.pca_embeddings_save_name): pca = pkl.load(open(cfg.pca.pca_save_name, "rb")) embeddings = reduce_embedding_dim(pca, embeddings, cfg) # Build faiss index from embeddings logging.info( f"Training index with embedding dim size {cfg.dims} using {faiss.get_num_gpus()} gpus" ) quantizer = faiss.IndexFlatL2(cfg.dims) index = faiss.IndexIVFFlat(quantizer, cfg.dims, cfg.nlist) index = faiss.index_cpu_to_all_gpus(index) index.train(embeddings) logging.info("Adding dataset embeddings to index") for i in tqdm(range(0, embeddings.shape[0], cfg.index_batch_size)): index.add(embeddings[i:i + cfg.index_batch_size]) logging.info("Saving index") faiss.write_index(faiss.index_gpu_to_cpu(index), cfg.index_save_name) logging.info("Index built and saved")
# features_astype = features.astype(np.float32)
# mat = faiss.PCAMatrix(1024, 128)
# mat.train(features_astype)
# assert mat.is_trained
# features_shape = mat.apply_py(features_astype)
# print(features_shape.shape)
# np.savetxt('PCA_features.txt', features_shape)

print(features_shape.shape)
dimension = 1024
n = 95276
nlist = 50
quantiser = faiss.IndexFlatL2(dimension)
index = faiss.IndexIVFFlat(quantiser, dimension, nlist, faiss.METRIC_L2)
# print(index.is_trained)
index.train(features_shape)
# print(index.ntotal)
index.add(features_shape)
# print(index.is_trained)
# print(index.ntotal)

index.nprobe = 10  # probe the 10 most similar clusters (must be set on the index to take effect)
k = 5  # return the 5 nearest neighbours
a = np.reshape(features_shape[1], (1, -1))
distances, indices = index.search(a, k)
print(distances)
print(indices)
def serve(args): # serve_demo: Load saved embeddings, serve question model. question in, results out. # serve_question: only serve question model. question in, vector out. # serve_context: only serve context model. context in, phrase-vector pairs out. # serve: serve all three. device = torch.device('cuda' if args.cuda else 'cpu') pprint(args.__dict__) interface = FileInterface(**args.__dict__) # use cache for metadata if args.cache: out = interface.cache(preprocess, args) processor = out['processor'] processed_metadata = out['processed_metadata'] else: processor = Processor(**args.__dict__) metadata = interface.load_metadata() processed_metadata = processor.process_metadata(metadata) model = Model(**args.__dict__).to(device) model.init(processed_metadata) interface.bind(processor, model) interface.load(args.iteration, session=args.load_dir) with torch.no_grad(): model.eval() if args.mode == 'serve_demo': phrases = [] paras = [] results = [] embs = [] idxs = [] iterator = interface.context_load(metadata=True, emb_type=args.emb_type) for _, (cur_phrases, each_emb, metadata) in zip(range(args.num_train_mats), iterator): embs.append(each_emb) phrases.extend(cur_phrases) for span in metadata['answer_spans']: results.append([len(paras), span[0], span[1]]) idxs.append(len(idxs)) paras.append(metadata['context']) if args.emb_type == 'dense': import faiss emb = np.concatenate(embs, 0) d = 4 * args.hidden_size * args.num_heads if args.metric == 'ip': quantizer = faiss.IndexFlatIP(d) # Exact Search elif args.metric == 'l2': quantizer = faiss.IndexFlatL2(d) else: raise ValueError() if args.nlist != args.nprobe: # Approximate Search. nlist > nprobe makes it faster and less accurate if args.bpv is None: if args.metric == 'ip': search_index = faiss.IndexIVFFlat(quantizer, d, args.nlist, faiss.METRIC_INNER_PRODUCT) elif args.metric == 'l2': search_index = faiss.IndexIVFFlat(quantizer, d, args.nlist) else: raise ValueError() else: assert args.metric == 'l2' # only l2 is supported for product quantization search_index = faiss.IndexIVFPQ(quantizer, d, args.nlist, args.bpv, 8) search_index.train(emb) else: search_index = quantizer search_index.add(emb) for cur_phrases, each_emb, metadata in iterator: phrases.extend(cur_phrases) for span in metadata['answer_spans']: results.append([len(paras), span[0], span[1]]) paras.append(metadata['context']) search_index.add(each_emb) if args.nlist != args.nprobe: search_index.nprobe = args.nprobe def search(emb, k): D, I = search_index.search(emb, k) return D[0], I[0] elif args.emb_type == 'sparse': assert args.metric == 'l2' # currently only l2 is supported (couldn't find a good ip library) import pysparnn.cluster_index as ci cp = ci.MultiClusterIndex(embs, idxs) for cur_phrases, each_emb, metadata in iterator: phrases.extend(cur_phrases) for span in metadata['answer_spans']: results.append([len(paras), span[0], span[1]]) paras.append(metadata['context']) for each_vec in each_emb: cp.insert(each_vec, len(idxs)) idxs.append(len(idxs)) def search(emb, k): return zip(*[each[0] for each in cp.search(emb, k=k)]) else: raise ValueError() def retrieve(question, k): example = {'question': question, 'id': 'real', 'idx': 0} dataset = (processor.preprocess(example), ) loader = DataLoader(dataset, batch_size=1, collate_fn=processor.collate) batch = next(iter(loader)) question_output = model.get_question(**batch) question_results = processor.postprocess_question_batch(dataset, batch, question_output) id_, emb = question_results[0] D, I = search(emb, k) out = [(paras[results[i][0]], 
results[i][1], results[i][2], '%.4r' % d.item(),) for d, i in zip(D, I)] return out if args.mem_info: import psutil import os pid = os.getpid() py = psutil.Process(pid) info = py.memory_info()[0] / 2. ** 30 print('Memory Use: %.2f GB' % info) # Demo server. Requires flask and tornado from flask import Flask, request, jsonify from flask_cors import CORS from tornado.wsgi import WSGIContainer from tornado.httpserver import HTTPServer from tornado.ioloop import IOLoop app = Flask(__name__, static_url_path='/static') app.config['JSONIFY_PRETTYPRINT_REGULAR'] = False CORS(app) @app.route('/') def index(): return app.send_static_file('index.html') @app.route('/files/<path:path>') def static_files(path): return app.send_static_file('files/' + path) @app.route('/api', methods=['GET']) def api(): query = request.args['query'] out = retrieve(query, 5) return jsonify(out) print('Starting server at %d' % args.port) http_server = HTTPServer(WSGIContainer(app)) http_server.listen(args.port) IOLoop.instance().start()
def test_IndexIVF_2(self):
    index = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, 10)
    index.train(xt)
    index.add(xb)
def _build_query_ann_index( charge: int, mz_splits: np.ndarray, vectorize: Callable, n_probe: int, batch_size: int, n_neighbors: int, n_neighbors_ann: int, precursor_tol_mass: float, precursor_tol_mode: str, distances: np.ndarray, indices: np.ndarray, indptr: np.ndarray, work_dir: str) -> pd.DataFrame: """ Create ANN index(es) for spectra with the given charge per precursor m/z split. Parameters ---------- charge : int Precursor charge of the spectra to be processed. mz_splits : np.ndarray M/z splits used to create separate ANN indexes. vectorize : Callable Function to convert the spectra to vectors. batch_size : int The number of vectors to be simultaneously added to the index. work_dir : str Directory to read and store (intermediate) results. Returns ------- pd.DataFrame Metadata (identifier, precursor charge, precursor m/z) of the spectra for which indexes were built. """ identifiers, precursor_mzs = [], [] indptr_i = 0 # Find neighbors per specified precursor m/z interval. for mz in tqdm.tqdm(mz_splits, desc='Intervals queried', unit='index'): pkl_filename = os.path.join(work_dir, 'spectra', f'{charge}_{mz}.pkl') if not os.path.isfile(pkl_filename): continue # Read the spectra for the m/z split. with open(pkl_filename, 'rb') as f_in: spectra_split = pickle.load(f_in) precursor_mzs_split = [] for spec in spectra_split: identifiers.append(spec.identifier) precursor_mzs_split.append(spec.precursor_mz) precursor_mzs.append(np.asarray(precursor_mzs_split)) # Convert the spectra to vectors. vectors_split = vectorize(spectra_split) n_split, dim = len(spectra_split), vectors_split.shape[1] # Figure out a decent value for the n_list hyperparameter based on # the number of vectors. # Rules of thumb from the Faiss wiki: # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index#how-big-is-the-dataset if n_split == 0: continue if n_split < 10e2: # Use a brute-force index instead of an ANN index when there # are only a few items. n_list = -1 elif n_split < 10e5: n_list = 2**math.floor(math.log2(n_split / 39)) elif n_split < 10e6: n_list = 2**16 elif n_split < 10e7: n_list = 2**18 else: n_list = 2**20 if n_split > 10e8: logger.warning('More than 1B vectors to be indexed, consider ' 'decreasing the ANN size') # Create an ANN index using the inner product (proxy for cosine # distance) for fast NN queries. if n_list <= 0: index = faiss.IndexIDMap(faiss.IndexFlatIP(dim)) else: index = faiss.IndexIVFFlat(faiss.IndexFlatIP(dim), dim, n_list, faiss.METRIC_INNER_PRODUCT) index.nprobe = min(math.ceil(index.nlist / 8), n_probe) # Compute cluster centroids. # noinspection PyArgumentList index.train(vectors_split) # Add the vectors to the index in batches. batch_size = min(n_split, batch_size) for batch_start in range(0, n_split, batch_size): batch_stop = min(batch_start + batch_size, n_split) # noinspection PyArgumentList index.add_with_ids(vectors_split[batch_start:batch_stop], np.arange(batch_start, batch_stop)) # Query the index to calculate NN distances. _dist_mz_interval( index, vectors_split, precursor_mzs[-1], batch_size, n_neighbors, n_neighbors_ann, precursor_tol_mass, precursor_tol_mode, distances, indices, indptr, indptr_i) indptr_i += vectors_split.shape[0] return pd.DataFrame({'identifier': identifiers, 'precursor_charge': charge, 'precursor_mz': np.hstack(precursor_mzs)})
def main(sys): np.seterr(over='ignore') m = len(sys) print("The script has the name %s" % (sys[0])) print("initiate: %s " % (sys[0])) print("Number of arguments: ", m, " arguments.") input_file_dataset = sys[1] input_file_queries = sys[2] k = int(sys[3]) var = sys[4] run = sys[5] ground_truth_D = sys[6] ground_truth_I = sys[7] error = float(sys[8]) nlist = int(sys[9]) #number of clusters nprobe = int(sys[10]) #how many times repeat search print("check of the arguments") for i in range(m): print("arguments: %s " % (sys[i])) dataset = os.path.realpath(input_file_dataset) queryset = os.path.realpath(input_file_queries) groundtruth_D = os.path.realpath(ground_truth_D) groundtruth_I = os.path.realpath(ground_truth_I) #ground_truth = os.path.realpath(output_file_gt) a_vectors = np.loadtxt(dataset).astype(np.float32) query_set = np.loadtxt(queryset).astype(np.float32) GT_D = np.loadtxt(groundtruth_D).astype(np.float32) GT_I = np.loadtxt(groundtruth_I).astype(np.float32) # # k = len(GT_D[0]) n_db = len(a_vectors) d = len(a_vectors[0]) #dimension of database n_q = len(query_set) fo = len(a_vectors) # nlist = int(float(fo) / k) #number of clusters # nprobe = int((k/2)+1) #how many times repeat search print("check of dimensions") print("param n_db", n_db) print("param d", d) print("param k", k) print("param n_q", n_q) print("param nlist", nlist) print("param nprobe", nprobe) print("param error", error) print("faiss ...") start1 = time.clock() quantizer = faiss.IndexHNSWFlat(d, 32) index = faiss.IndexIVFFlat(quantizer, d, nlist) index.cp.min_points_per_centroid = 10 # quiet warning index.quantizer_trains_alone = 2 assert not index.is_trained index.train(a_vectors) assert index.is_trained quantizer.hnsw.efSearch = nprobe #setting up the precission, higger the better but slower (def. is 40) index.add(a_vectors) stop1 = time.clock() #----#start search start2 = time.clock() D, I = index.search(query_set, k) # actual search stop2 = time.clock() #---#end #run recall recall_i = recall_similar_match(GT_I, I) recall_d = recall_with_error(GT_D, D, error) stringname_D = 'D' + sys[0][1:7] + '_' + var + '.txt' stringname_I = 'I' + sys[0][1:7] + '_' + var + '.txt' np.savetxt(stringname_D, D) np.savetxt(stringname_I, I) time1 = stop1 - start1 time2 = stop2 - start2 #run, filename, index_time, build_time, recall_D, recall_I, n_db, n_q, d, k print_time(run, sys[0], time1, time2, recall_d, recall_i, n_db, n_q, d, k, error) print("finish")
import numpy as np
import faiss  # make faiss available

if __name__ == "__main__":
    e = np.load('encodings.npy', allow_pickle=True).tolist()
    vectorEncoding = np.array([x.get('encondings') for x in e], dtype=np.float32)
    vectorId = np.array([x.get('id') for x in e])
    d = len(vectorEncoding[0])
    print('Vector dimension d: {}'.format(d))

    # IndexIVFFlat needs a coarse quantizer and a number of cells (nlist);
    # nlist = 100 here is only an illustrative choice.
    nlist = 100
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist)  # build the index
    index.train(vectorEncoding)  # IVF indexes must be trained before adding vectors
    index2 = faiss.IndexIDMap(index)
    index2.add_with_ids(vectorEncoding, vectorId)
    print('Index size: {}'.format(index2.ntotal))
    faiss.write_index(index2, '10000.index')
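# A hedged follow-on sketch (not part of the original script): reload the saved
# index and query it. Because the vectors were added through add_with_ids, the
# returned neighbor labels are the original vectorId values rather than row
# positions. nprobe is left at its default here.
loaded = faiss.read_index('10000.index')
D, I = loaded.search(vectorEncoding[:5], 4)  # 4 nearest ids per query
print(I)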
print "add" # to see progress index.verbose = True index.add(xb) print "search" for efSearch in 16, 32, 64, 128, 256: print "efSearch", efSearch, index.hnsw.efSearch = efSearch evaluate(index) if 'ivf' in todo: print "Testing IVF Flat (baseline)" quantizer = faiss.IndexFlatL2(d) index = faiss.IndexIVFFlat(quantizer, d, 16384) index.cp.min_points_per_centroid = 5 # quiet warning # to see progress index.verbose = True print "training" index.train(xt) print "add" index.add(xb) print "search" for nprobe in 1, 4, 16, 64, 256: print "nprobe", nprobe, index.nprobe = nprobe
"/home/jianx/results/passage_0__emb_p__data_obj_0.pb") query_train_embeddings = obj_reader( "/home/jianx/results/query_0__emb_p__data_obj_0.pb") query_train_mapping = obj_reader( "/datadrive/jianx/data/annoy/100_ance_query_train_map.dict") pid_mapping = obj_reader( "/datadrive/jianx/data/annoy/100_ance_passage_map.dict") print_message("Building index") faiss.omp_set_num_threads(16) dim = passage_embeddings.shape[1] if IS_FLAT: cpu_index = faiss.IndexFlatIP(dim) else: quantizer = faiss.IndexFlatIP(dim) cpu_index = faiss.IndexIVFFlat(quantizer, dim, NLIST) assert not cpu_index.is_trained cpu_index.train(passage_embeddings) assert cpu_index.is_trained cpu_index.add(passage_embeddings) print_message("Searching for all queries") with open(OUT_PATH, "w+") as f: print_message("Writing to {}".format(OUT_PATH)) f.write("") for starting in range(0, len(query_train_embeddings), BATCH_SIZE): mini_batch = query_train_embeddings[starting:starting + BATCH_SIZE] _, dev_I = cpu_index.search(mini_batch, RANK) print_message("Batch No.{}/{}".format( starting / BATCH_SIZE + 1, len(query_train_embeddings) / BATCH_SIZE))
def test_dedup(self): d = 10 nb = 1000 nq = 200 nt = 500 xt, xb, xq = get_dataset_2(d, nt, nb, nq) # introduce duplicates xb[500:900:2] = xb[501:901:2] xb[901::4] = xb[900::4] xb[902::4] = xb[900::4] xb[903::4] = xb[900::4] # also in the train set xt[201::2] = xt[200::2] quantizer = faiss.IndexFlatL2(d) index_new = faiss.IndexIVFFlatDedup(quantizer, d, 20) index_new.verbose = True # should display # IndexIVFFlatDedup::train: train on 350 points after dedup (was 500 points) index_new.train(xt) index_ref = faiss.IndexIVFFlat(quantizer, d, 20) assert index_ref.is_trained index_ref.nprobe = 5 index_ref.add(xb) index_new.nprobe = 5 index_new.add(xb) Dref, Iref = index_ref.search(xq, 20) Dnew, Inew = index_new.search(xq, 20) for i in range(nq): ref = self.normalize_res(Dref[i], Iref[i]) new = self.normalize_res(Dnew[i], Inew[i]) assert ref == new # test I/O _, tmpfile = tempfile.mkstemp() try: faiss.write_index(index_new, tmpfile) index_st = faiss.read_index(tmpfile) finally: if os.path.exists(tmpfile): os.unlink(tmpfile) Dst, Ist = index_st.search(xq, 20) for i in range(nq): new = self.normalize_res(Dnew[i], Inew[i]) st = self.normalize_res(Dst[i], Ist[i]) assert st == new # test remove toremove = np.hstack((np.arange(3, 1000, 5), np.arange(850, 950))) index_ref.remove_ids(toremove) index_new.remove_ids(toremove) Dref, Iref = index_ref.search(xq, 20) Dnew, Inew = index_new.search(xq, 20) for i in range(nq): ref = self.normalize_res(Dref[i], Iref[i]) new = self.normalize_res(Dnew[i], Inew[i]) assert ref == new
import faiss
import numpy as np

if __name__ == '__main__':
    nlist = 5
    dimension = 512
    quantizer = faiss.IndexFlatL2(dimension)
    index = faiss.IndexIVFFlat(quantizer, dimension, nlist)

    features_folder = "features/features.npy"
    image_features = np.load(features_folder).astype('float32')
    index.train(image_features)
    index.add(image_features)
    faiss.write_index(index, "image_index")