def annSearch(uvec, ivec, topk):
    emds = 64  # embedding dimensionality (must match the vectors)
    indextree = faiss.IndexFlatIP(emds)
    indextree.add(ivec)
    Dis, Index = indextree.search(np.array(uvec).astype('float32'), topk)
    return Dis, Index
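# A minimal usage sketch for annSearch above, assuming faiss and numpy are
# imported; the data here is hypothetical random vectors that match the
# hardcoded 64-dimensional embedding size.
import numpy as np
import faiss

rng = np.random.default_rng(0)
item_vectors = rng.random((1000, 64), dtype=np.float32)  # indexed items
user_vectors = rng.random((5, 64), dtype=np.float32)     # query vectors
scores, ids = annSearch(user_vectors, item_vectors, topk=10)
# scores: (5, 10) inner-product similarities, highest first;
# ids: (5, 10) row indices into item_vectors.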
def update_index(self):
    path = os.path.abspath(
        os.path.dirname(__file__) + '/../../var/model/esse/embeddings.npy')
    embeddings = self.sentence_encoder.encode(
        [esse.get_index_text() for esse in self.esses])
    np.save(path, embeddings)
    self.index = faiss.IndexFlatIP(512)
    self.index.add(embeddings)
def load_train_data():
    eli5 = datasets.load_dataset("eli5", name="LFQA_reddit")
    eli5_train = eli5["train_eli5"]
    eli5_train_q_reps = np.memmap(
        "eli5_questions_reps.dat",
        dtype="float32",
        mode="r",
        shape=(eli5_train.num_rows, 128),
    )
    eli5_train_q_index = faiss.IndexFlatIP(128)
    eli5_train_q_index.add(eli5_train_q_reps)
    return (eli5_train, eli5_train_q_index)
def faissLoadKmeans(KMEANS_PKL):
    kmeans = pickle.load(open(KMEANS_PKL, 'rb'))
    vectors = np.array(kmeans.cluster_centers_).astype('float32')
    faissIndex = faiss.IndexFlatIP(DIM_BERT)
    faissIndex.add(vectors)
    return faissIndex
def __load_faiss_index(vectors: List[np.array], use_gpu: bool):
    vectors_dim = len(vectors[0])
    vector_stack = np.stack(vectors)
    index = faiss.IndexFlatIP(vectors_dim)
    if use_gpu:
        res = faiss.StandardGpuResources()
        index = faiss.index_cpu_to_gpu(res, 0, index)
    index.add(vector_stack)
    return index
def build_index(self):
    """:returns an inverted index for the search documents"""
    vectors = [self.encode(document) for document in self.documents]
    index = faiss.IndexIDMap(
        faiss.IndexFlatIP(768))  # dimensionality of vector space
    # Add document vectors into the index after transforming them into numpy
    # arrays. IDs should match len(documents).
    index.add_with_ids(np.array([vec.numpy() for vec in vectors]),
                       np.array(range(0, len(self.documents))))
    return index
def build_index(image_features):
    """
    Builds an index over the provided image features for fast access.

    @param image_features: the features of the images
    @return: Index
    """
    index = faiss.IndexFlatIP(image_features.shape[1])
    index.add(image_features)
    return index
def faiss_knn(Q, X, k, dist='IP'):
    d = X.shape[1]
    if dist == 'IP':
        index = faiss.IndexFlatIP(d)
    elif dist == 'L2':
        index = faiss.IndexFlatL2(d)
    else:
        raise ValueError("dist must be 'IP' or 'L2', got {!r}".format(dist))
    index.add(X)
    dists, inds = index.search(Q, k)
    return dists, inds
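# Hedged usage sketch for faiss_knn above: with the inner-product index,
# cosine similarity is obtained by L2-normalizing both sides first. The data
# and sizes below are assumptions for illustration, not part of the original.
import numpy as np
import faiss

rng = np.random.default_rng(0)
X = rng.random((10000, 128), dtype=np.float32)  # database vectors
Q = rng.random((16, 128), dtype=np.float32)     # query vectors
faiss.normalize_L2(X)  # in-place unit-length normalization
faiss.normalize_L2(Q)
dists, inds = faiss_knn(Q, X, k=5, dist='IP')   # dists are now cosine sims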
def index_model(names, models, norm='l2'):
    '''
    To normalize or not to normalize:

    - stats.stackexchange.com/questions/177905
    - stackoverflow.com/questions/36034454

    Usage:

    fp = f'{base}/models/{n}/nanotext_r89.model'
    model3 = load_embedding(fp)
    m3 = subtract_mean(model3)
    found, m, index = index_model(names, [m1, m2, m3], norm=norm)
    '''
    import faiss
    import numpy as np
    from sklearn.preprocessing import normalize
    from nanotext.io import eprint

    m = []
    found, notfound = [], 0

    # First take the mean of the vectors across models ...
    for i in names:
        model_vv = []
        try:
            for model in models:
                model_vv.append(model[i])
        except KeyError:
            notfound += 1
            continue
        # If only one model is present, this returns the original vector.
        sum_ = np.sum(model_vv, axis=0) / len(model_vv)
        found.append(i)
        m.append(sum_)

    db = np.array(m, dtype='float32')
    dim = db.shape[1]  # dimensions

    # ... then normalize.
    if not norm:
        index = faiss.IndexFlatL2(dim)
    elif norm == 'l2':
        index = faiss.IndexFlatIP(dim)
        db = normalize(db, norm=norm, axis=1)
        # The inner product (IP) of two unit-length vectors is the cosine
        # similarity.
    else:
        raise ValueError('This norm is not supported, abort!')

    index.add(db)

    if notfound > 0:
        fraction = round(notfound / len(names), 4)
        eprint(f'{notfound} entries ({fraction}) not found.')

    return found, db, index
def validate(epoch, loader, imenc, capenc, vocab, args, SETTING):
    begin = time.time()
    print("begin validation for epoch {}".format(epoch), flush=True)
    dset = EmbedDset(loader, imenc, capenc, vocab, args)
    print("val dataset created | {}".format(sec2str(time.time() - begin)),
          flush=True)
    im = dset.embedded["image"]
    cap = dset.embedded["caption"]
    nd = im.shape[0]
    nq = cap.shape[0]
    d = im.shape[1]
    cpu_index = faiss.IndexFlatIP(d)
    print("# images: {}, # captions: {}, dimension: {}".format(nd, nq, d),
          flush=True)

    # im2cap
    cpu_index.add(cap)
    # Score every combination and sort; D = similarities, I = caption ids.
    D, I = cpu_index.search(im, nq)
    data = {}
    allrank = []
    # TODO: Make more efficient, do not hardcode 5
    cap_per_image = 5
    # Record the rank of the correct answer for each caption
    # (there are 5 captions per image).
    for i in range(cap_per_image):
        gt = (np.arange(nd) * cap_per_image).reshape(-1, 1) + i
        rank = np.where(I == gt)[1]
        allrank.append(rank)
    allrank = np.stack(allrank)
    # Minimal rank over the 5 ground-truth captions (best of 5).
    allrank = np.amin(allrank, 0)
    # How many queries had the correct answer ranked below @num.
    for rank in [1, 5, 10, 20]:
        data["i2c_recall@{}".format(rank)] = \
            100 * np.sum(allrank < rank) / len(allrank)
    data["i2c_median@r"] = np.median(allrank) + 1
    data["i2c_mean@r"] = np.mean(allrank)

    # cap2im
    cpu_index.reset()
    cpu_index.add(im)
    D, I = cpu_index.search(cap, nd)
    # TODO: Make more efficient, do not hardcode 5
    gt = np.arange(nq).reshape(-1, 1) // cap_per_image
    allrank = np.where(I == gt)[1]
    for rank in [1, 5, 10, 20]:
        data["c2i_recall@{}".format(rank)] = \
            100 * np.sum(allrank < rank) / len(allrank)
    data["c2i_median@r"] = np.median(allrank) + 1
    data["c2i_mean@r"] = np.mean(allrank)

    print("-" * 50)
    print("results of cross-modal retrieval")
    for key, val in data.items():
        print("{}: {}".format(key, val), flush=True)
    print("-" * 50)
    return data
def __init__(self, args: Namespace, dim: int = 2048) -> None:
    self.data_dir = args.data_dir
    self.images_dir = args.images_dir
    with open(path.join(args.data_dir, args.captions)) as infile:
        self.captions = infile.readlines()
    self.embeddings = np.load(path.join(args.data_dir, args.embeddings))
    # k and metric are read from args, like the other settings above.
    self.k = args.k
    self.metric = args.metric
    if self.metric == -1:
        # Cosine similarity: L2-normalize, then take the inner product.
        self.index = faiss.IndexFlatIP(dim)
        faiss.normalize_L2(self.embeddings)
        self.index.add(self.embeddings)
    elif self.metric == 1:
        # Euclidean distance (no square root)
        self.index = faiss.IndexFlatL2(dim)
        self.index.add(self.embeddings)
    elif self.metric == 23:
        # Mahalanobis distance: whiten the data, then use squared L2.
        self.index = faiss.IndexFlatL2(dim)
        x_centered = self.embeddings - self.embeddings.mean(0)
        self.transform = np.linalg.inv(np.linalg.cholesky(
            np.dot(x_centered.T, x_centered) / x_centered.shape[0])).T
        self.index.add(
            np.dot(self.embeddings, self.transform).astype(np.float32))
    elif self.metric == 0:
        # Inner product
        self.index = faiss.IndexFlatIP(dim)
        self.index.add(self.embeddings)
    else:
        self.index = faiss.IndexFlat(dim, self.metric)
        self.index.add(self.embeddings)
    self.model = wide_resnet101_2(pretrained=True, progress=True)
    self.model.eval()  # Don't forget to put the model in evaluation mode!
    self.model.fc = Identity()
    # Use the recommended sequence of transforms for ImageNet-pretrained
    # models.
    self.transforms = Compose([
        Resize(256, interpolation=Image.BICUBIC),  # Default is bilinear
        CenterCrop(224),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406],
                  std=[0.229, 0.224, 0.225]),
    ])
def __init__(self, X, method="inner_prod", *args, **kwargs):
    import faiss
    self.method = method
    if method == "inner_prod":
        self.index = faiss.IndexFlatIP(X.shape[1])
    elif method == "l2":
        self.index = faiss.IndexFlatL2(X.shape[1])
    else:
        raise NotImplementedError()
    self.index.add(X.astype(np.float32))
def create_index(vectors, dim=300):
    nlist = 5
    nprobe = nlist
    quantizer = faiss.IndexFlatIP(dim)
    index = faiss.IndexIVFFlat(quantizer, dim, nlist,
                               faiss.METRIC_INNER_PRODUCT)
    index.train(vectors)
    index.add(vectors)
    index.nprobe = nprobe
    return index, quantizer
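# Sketch of how nprobe trades recall for speed with the IVF index built by
# create_index above. Since the function sets nprobe == nlist, its searches
# are exhaustive over all cells; the data below is an assumed example.
import numpy as np
import faiss

rng = np.random.default_rng(0)
vectors = rng.random((5000, 300), dtype=np.float32)
index, quantizer = create_index(vectors)
index.nprobe = 1   # probe a single cell: fastest, lowest recall
_, approx_ids = index.search(vectors[:10], 5)
index.nprobe = 5   # probe every cell (nlist == 5): equivalent to flat search
_, exact_ids = index.search(vectors[:10], 5)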
def __init__(self, sentence_encoder: Optional[SentenceEncoder] = None):
    self.esses: List[Esse] = []
    self.__load_esses()
    if sentence_encoder is None:
        sentence_encoder = SentenceEncoder()
    self.sentence_encoder: SentenceEncoder = sentence_encoder
    self.index: faiss.IndexFlatIP = faiss.IndexFlatIP(512)
    self.__load_index()
def get_faiss_ip_index(d=768, use_gpu=True):
    # Build an inner-product (CPU) index.
    index_cpu = faiss.IndexFlatIP(d)
    if use_gpu:
        # Claim a single GPU resource.
        resource = faiss.StandardGpuResources()
        # Turn it into a GPU index.
        index_gpu = faiss.index_cpu_to_gpu(resource, 0, index_cpu)
        return index_gpu
    return index_cpu
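# Minimal usage sketch for get_faiss_ip_index above; the hasattr check is one
# way to fall back to CPU when a CPU-only faiss build is installed, and the
# data is a hypothetical example.
import numpy as np
import faiss

index = get_faiss_ip_index(d=768,
                           use_gpu=hasattr(faiss, 'StandardGpuResources'))
xb = np.random.rand(100, 768).astype('float32')
index.add(xb)                     # same API for CPU and GPU indexes
sims, ids = index.search(xb[:3], 5)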
def __build_index(self, index_dimension):
    if self.index_type is IndexType.L2_INDEX:
        log.debug("Building L2 index")
        index = faiss.IndexFlatL2(index_dimension)
    elif self.index_type is IndexType.COSINE_INDEX:
        log.debug("Building cosine index")
        index = faiss.IndexFlatIP(index_dimension)
    else:
        raise ValueError(f"Unknown index type {self.index_type}")
    self.__index = faiss.IndexIDMap(index)
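# Sketch of how an IndexIDMap wrapper like the one built above is typically
# used: it lets you attach your own 64-bit ids instead of sequential row
# numbers. This is a standalone assumed example, not part of the class.
import numpy as np
import faiss

dim = 128
index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))
vectors = np.random.rand(4, dim).astype('float32')
custom_ids = np.array([1001, 1002, 2001, 2002], dtype='int64')
index.add_with_ids(vectors, custom_ids)
sims, ids = index.search(vectors[:1], 2)  # `ids` holds the custom ids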
def _init_index(self):
    quantizer = faiss.IndexFlatIP(self.dimension)
    if self.n_pq is None:
        self.index = faiss.IndexIVFFlat(quantizer, self.dimension,
                                        self.n_clusters)
    else:
        self.index = faiss.IndexIVFPQ(quantizer, self.dimension,
                                      self.n_clusters, self.n_pq,
                                      self.n_bytes)
    # Note: the FAISS attribute is spelled `nprobe`.
    self.index.nprobe = self.nprob
def get_faiss_index(faiss_index_path):
    if os.path.exists(faiss_index_path):
        faiss_index = faiss.read_index(faiss_index_path)
        print('Read faiss index from {}'.format(faiss_index_path))
        return faiss_index
    else:
        faiss_index = faiss.IndexFlatIP(4096)
        faiss_index = faiss.IndexIDMap2(faiss_index)
        print('Creating new faiss index at {}'.format(faiss_index_path))
        return faiss_index
def kNN(x, y, k, use_ann_search=False, ann_num_clusters=32768,
        ann_num_cluster_probe=3):
    start_time = time.time()
    if use_ann_search:
        print("Perform approx. kNN search")
        n_cluster = min(ann_num_clusters, int(y.shape[0] / 1000))
        quantizer = faiss.IndexFlatIP(y.shape[1])
        index = faiss.IndexIVFFlat(quantizer, y.shape[1], n_cluster,
                                   faiss.METRIC_INNER_PRODUCT)
        index.nprobe = ann_num_cluster_probe
        index.train(y)
        index.add(y)
        sim, ind = index.search(x, k)
    else:
        print("Perform exact search")
        idx = faiss.IndexFlatIP(y.shape[1])
        idx.add(y)
        sim, ind = idx.search(x, k)
    print("Done: {:.2f} sec".format(time.time() - start_time))
    return sim, ind
def _get_faiss_index(self):
    # with Pool(cpu_count()) as p:
    #     question_bert = p.map(eval, self.df["Q_FFNN_embeds"].tolist())
    #     answer_bert = p.map(eval, self.df["A_FFNN_embeds"].tolist())
    question_bert = self.df["Q_FFNN_embeds"].tolist()
    self.df.drop(columns=["Q_FFNN_embeds"], inplace=True)
    answer_bert = self.df["A_FFNN_embeds"].tolist()
    self.df.drop(columns=["A_FFNN_embeds"], inplace=True)
    question_bert = np.array(question_bert, dtype='float32')
    answer_bert = np.array(answer_bert, dtype='float32')
    self.answer_index = faiss.IndexFlatIP(answer_bert.shape[-1])
    self.question_index = faiss.IndexFlatIP(question_bert.shape[-1])
    self.answer_index.add(answer_bert)
    self.question_index.add(question_bert)
    del answer_bert, question_bert
def __init__(self, vector_sz: int, buffer_size: int = 50000,
             index_factory_string: str = None):
    super(DenseFlatIndexer, self).__init__(buffer_size=buffer_size)
    self.index = faiss.IndexFlatIP(vector_sz)
    if index_factory_string:
        self.index = faiss.index_factory(vector_sz, index_factory_string,
                                         faiss.METRIC_INNER_PRODUCT)
        self.index.nprobe = 32
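# Hedged sketch of what the index_factory branch above can produce: a factory
# string like "IVF256,Flat" builds an IVF index that must be trained before
# vectors are added. The factory string and data here are assumptions for
# illustration.
import numpy as np
import faiss

vector_sz = 768
index = faiss.index_factory(vector_sz, "IVF256,Flat",
                            faiss.METRIC_INNER_PRODUCT)
xb = np.random.rand(10000, vector_sz).astype('float32')
index.train(xb)    # k-means over the 256 coarse centroids
index.add(xb)
index.nprobe = 32  # matches the probe count set in the constructor above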
def load_movie_vectors_into_faiss(self):
    """Load the movie vectors into FAISS."""
    movie_output_vectors, movie_ids_index = \
        self.tower_model_cls.get_movie_vectors()
    movie_output_vectors = movie_output_vectors.astype(np.float32)
    faiss_model = faiss.IndexFlatIP(self.tower_model_cls.dense_size)
    faiss_model.add(movie_output_vectors)
    return faiss_model, movie_ids_index
def train_faiss(item_feature, D):
    '''
    Use IndexFlatIP to find similar items.
    '''
    # Construct the index.
    index = faiss.IndexFlatIP(D)
    index.add(item_feature)
    print(index.ntotal)
    # Avoid shadowing the dimension argument D with the returned scores.
    sims, I = index.search(item_feature, 200)
    return I
def main():
    print("loading X_PCA...")
    X_pca = joblib.load("../chapter09/X_PCA")
    t_index = joblib.load("../chapter09/t_index")
    faiss_pca = faiss.IndexFlatIP(300)
    faiss_pca.add(X_pca.astype('float32'))

    print("loading word2vec...")
    word_vectors = joblib.load("word_vectors")
    word_index = joblib.load("word_index")
    faiss_w2v = faiss.IndexFlatIP(300)
    faiss_w2v.add(word_vectors.astype('float32'))

    with open("family.txt") as f, \
            open("pca_family.txt", mode="w") as f_pca, \
            open("w2v_family.txt", mode="w") as f_w2v:
        for line in map(lambda x: x.rstrip(), f):
            word = [""] * 3
            word[0], word[1], word[2], *_ = line.split()
            try:
                # Similar-vector search with the chapter 9 (PCA) vectors.
                v1 = X_pca[t_index[word[0]]]
                v2 = X_pca[t_index[word[1]]]
                v3 = X_pca[t_index[word[2]]]
                vec_pca = v2 - v1 + v3
                sim_num = faiss_pca.search(
                    np.array([vec_pca]).astype('float32'), 1)[1][0][0]
                pred_word = list(t_index.keys())[sim_num]
                f_pca.write(f"{word[0]} {word[1]} {word[2]} {pred_word}\n")
            except KeyError:
                f_pca.write(f"{word[0]} {word[1]} {word[2]} -\n")
            try:
                # The same analogy search with the word2vec vectors.
                v1 = word_vectors[word_index[word[0]]]
                v2 = word_vectors[word_index[word[1]]]
                v3 = word_vectors[word_index[word[2]]]
                vec_w2v = v2 - v1 + v3
                sim_num = faiss_w2v.search(
                    np.array([vec_w2v]).astype('float32'), 1)[1][0][0]
                pred_word = list(word_index.keys())[sim_num]
                f_w2v.write(f"{word[0]} {word[1]} {word[2]} {pred_word}\n")
            except KeyError:
                f_w2v.write(f"{word[0]} {word[1]} {word[2]} -\n")
def setup_model_utils():
    """
    Loads the SpaCy NLP model and builds the embedding index.
    """
    ModelUtils.nlp = spacy.load(Config.get_config("spacy_model_name_key"))
    ModelUtils.nlp.max_length = 10030000
    ModelUtils.generate_embeddings_matrix()
    ModelUtils.index = faiss.IndexFlatIP(ModelUtils.dimensions)
    faiss.normalize_L2(ModelUtils.embeddings)
    ModelUtils.index.add(ModelUtils.embeddings)
def __init__(self,
             feats,
             k,
             index_path='',
             index_key='',
             nprobe=128,
             omp_num_threads=None,
             rebuild_index=True,
             verbose=True,
             **kwargs):
    import faiss
    if omp_num_threads is not None:
        faiss.omp_set_num_threads(omp_num_threads)
    self.verbose = verbose

    with Timer('[faiss] build index', verbose):
        if index_path != '' and not rebuild_index and os.path.exists(
                index_path):
            print('[faiss] read index from {}'.format(index_path))
            index = faiss.read_index(index_path)
        else:
            feats = feats.astype('float32')
            size, dim = feats.shape
            index = faiss.IndexFlatIP(dim)
            if index_key != '':
                assert index_key.find(
                    'HNSW') < 0, 'HNSW returns distances instead of sims'
                metric = faiss.METRIC_INNER_PRODUCT
                nlist = min(4096, 8 * round(math.sqrt(size)))
                if index_key == 'IVF':
                    quantizer = index
                    index = faiss.IndexIVFFlat(quantizer, dim, nlist, metric)
                else:
                    index = faiss.index_factory(dim, index_key, metric)
                if index_key.find('Flat') < 0:
                    assert not index.is_trained
                index.train(feats)
                index.nprobe = min(nprobe, nlist)
                assert index.is_trained
                print('nlist: {}, nprobe: {}'.format(nlist, nprobe))
            index.add(feats)
            if index_path != '':
                print('[faiss] save index to {}'.format(index_path))
                mkdir_if_no_exists(index_path)
                faiss.write_index(index, index_path)

    with Timer('[faiss] query topk {}'.format(k), verbose):
        knn_ofn = index_path + '.npz'
        if os.path.exists(knn_ofn):
            print('[faiss] read knns from {}'.format(knn_ofn))
            self.knns = np.load(knn_ofn)['data']
        else:
            sims, nbrs = index.search(feats, k=k)
            # Store (neighbor ids, 1 - similarity) so values act as distances.
            self.knns = [(np.array(nbr, dtype=np.int32),
                          1 - np.array(sim, dtype=np.float32))
                         for nbr, sim in zip(nbrs, sims)]
def build_index(path, features: np.ndarray, train, normalize):
    if path is None:
        if normalize:
            features = features / (
                (features ** 2).sum(axis=1, keepdims=True) ** 0.5)
        dim = features.shape[1]
        if not train:
            index = faiss.IndexFlatIP(dim)
            index.add(features)
        else:
            quantizer = faiss.IndexFlatIP(dim)
            num_clusters = 100
            index = faiss.IndexIVFFlat(quantizer, dim, num_clusters,
                                       faiss.METRIC_INNER_PRODUCT)
            index.train(features)
            index.add(features)
    else:
        assert os.path.exists(path), f"{path} does not exist!"
        index = faiss.read_index(path)
    return index
def build_index(corpus_embedding, n_cluster=256, embedding_size=768,
                nprobe=4):
    quantizer = faiss.IndexFlatIP(embedding_size)
    index = faiss.IndexIVFFlat(quantizer, embedding_size, n_cluster,
                               faiss.METRIC_INNER_PRODUCT)
    index.nprobe = nprobe
    # L2-normalize so the inner product behaves like cosine similarity.
    corpus_embeddings = corpus_embedding / np.linalg.norm(corpus_embedding,
                                                          axis=1)[:, None]
    index.train(corpus_embeddings)
    index.add(corpus_embeddings)
    return index
def __setstate__(self, newstate):
    embedding_space_dims = newstate[newstate['embedding_space_dims_name']]
    similarity_algorithm = newstate[newstate['similarity_algorithm_name']]
    index_np = newstate[newstate['index_np_name']]
    faiss_index = faiss.IndexFlatIP(embedding_space_dims)
    if similarity_algorithm == SimilarityAlgorithm.CosineSimilarity:
        # Normalize with L2 as a proxy for cosine search.
        faiss.normalize_L2(index_np)
    faiss_index.add(index_np)
    newstate[newstate['faiss_index_name']] = faiss_index
    self.__dict__.update(newstate)
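# A small check of the trick used in __setstate__ above: inner product over
# L2-normalized vectors equals cosine similarity over the originals. The data
# below is a hypothetical example.
import numpy as np
import faiss

rng = np.random.default_rng(0)
xb = rng.random((100, 64), dtype=np.float32)
expected = (xb[:1] @ xb.T) / (np.linalg.norm(xb[:1]) *
                              np.linalg.norm(xb, axis=1))

faiss.normalize_L2(xb)  # note: modifies xb in place
index = faiss.IndexFlatIP(64)
index.add(xb)
sims, ids = index.search(xb[:1], 100)
# sims holds the cosine similarities, sorted descending.
assert np.allclose(np.sort(expected[0])[::-1], sims[0], atol=1e-5)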
def _sim_faiss(query_features, index_features, KNN):
    """Compute similarity with FAISS."""
    import faiss
    assert query_features.shape[1] == index_features.shape[1]
    dim = query_features.shape[1]
    Nq = query_features.shape[0]
    Nd = index_features.shape[0]
    index = faiss.IndexFlatIP(dim)
    index.add(index_features)
    D, I = index.search(query_features, KNN)
    return D, I, (Nq, Nd)