def characterize_dataset(model, sequences, entity2unique, entity2same, unique_text, nnlens):
    predictions = model.predict(sequences)
    t = AnnoyIndex(len(predictions[0]), metric='euclidean')  # length of the item vectors to be indexed
    t.set_seed(123)
    for i in range(len(predictions)):
        t.add_item(i, predictions[i])
    t.build(100)  # 100 trees

    for nnlen in nnlens:
        print("Characteristics at neighborhood length: " + str(nnlen))
        pos_distances = []
        neg_distances = []
        match = 0
        no_match = 0

        for key in entity2same:
            index = entity2unique[key]
            nearest = t.get_nns_by_vector(predictions[index], nnlen)
            nearest_text = set([unique_text[i] for i in nearest])
            expected_text = set(entity2same[key])
            overlap = expected_text.intersection(nearest_text)
            m = len(overlap)
            match += m
            # We asked for nnlen nearest neighbors, so at most nnlen - 1 of them
            # can be items other than the key itself; adjust the no-match count accordingly.
            no_match += min(len(expected_text), nnlen - 1) - m
            # Annoy returns the queried item itself as a nearest neighbor; remove it.
            if key in nearest_text:
                nearest_text.remove(key)
            # Sample only true negatives, i.e. neighbors that are not in the
            # expected set (sampling only semi-hard negatives is not defined here).
            pos = expected_text
            neg = nearest_text - expected_text
            for i in pos:
                pos_distances.append(t.get_distance(index, entity2unique[i]))
            for i in neg:
                neg_distances.append(t.get_distance(index, entity2unique[i]))

        recall = match / (match + no_match)
        print("mean positive distance: " + str(statistics.mean(pos_distances)))
        print("stdev positive distance: " + str(statistics.stdev(pos_distances)))
        print("max positive distance: " + str(max(pos_distances)))
        print("mean neg distance: " + str(statistics.mean(neg_distances)))
        print("stdev neg distance: " + str(statistics.stdev(neg_distances)))
        print("max neg distance: " + str(max(neg_distances)))
        print("recall: " + str(recall))
def _get_knn_graph_annoy(X, n_neighbors=5, dist_metric='euclidean', random_seed=0):
    '''Build a k-nearest-neighbor graph.
    Returns the nearest neighbor matrix and the corresponding distances.
    '''
    try:
        from annoy import AnnoyIndex
    except ImportError:
        raise ImportError('Please install the package "annoy". '
                          'Alternatively, set `knn_method=\'umap\'`.')
    npc = X.shape[1]
    ncell = X.shape[0]
    annoy_index = AnnoyIndex(npc, metric=dist_metric)
    annoy_index.set_seed(random_seed)
    for i in range(ncell):
        annoy_index.add_item(i, list(X[i, :]))
    annoy_index.build(10)  # 10 trees
    knn = []
    knn_dists = []
    for iCell in range(ncell):
        # Annoy returns the query item itself as the first neighbor; skip it.
        neighbors, dists = annoy_index.get_nns_by_item(iCell, n_neighbors + 1,
                                                       include_distances=True)
        knn.append(neighbors[1:])
        knn_dists.append(dists[1:])
    knn = np.array(knn, dtype=int)
    knn_dists = np.array(knn_dists)
    return knn, knn_dists
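# A quick sketch exercising _get_knn_graph_annoy on random data. This is an
# illustrative usage example, not part of the original module; it assumes
# numpy is imported as np alongside the function above.
import numpy as np

X = np.random.rand(200, 30)
knn, knn_dists = _get_knn_graph_annoy(X, n_neighbors=5)
print(knn.shape, knn_dists.shape)  # (200, 5) (200, 5)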
def annoy_build(df, id, metric='euclidean'):
    m = AnnoyIndex(VECTOR_SIZE, metric=metric)
    m.set_seed(42)
    for _, row in df.iterrows():
        m.add_item(row[id], row['vectors'])
    m.build(TREE_QUERIES)
    return m
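# A minimal usage sketch for annoy_build. VECTOR_SIZE and TREE_QUERIES are
# module-level constants in the original code; the values and column names
# below are assumptions made for this example.
import pandas as pd

VECTOR_SIZE = 4    # assumed: dimensionality of the stored vectors
TREE_QUERIES = 10  # assumed: number of trees to build

df = pd.DataFrame({
    'item_id': [0, 1, 2],
    'vectors': [[0.1, 0.2, 0.3, 0.4],
                [0.4, 0.3, 0.2, 0.1],
                [0.9, 0.8, 0.7, 0.6]],
})
index = annoy_build(df, 'item_id')
print(index.get_nns_by_item(0, 2))  # the queried item itself comes back first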
def find_candidates_udf(u_factor):
    from annoy import AnnoyIndex  # must import here: each Spark worker needs its own import
    u = AnnoyIndex(rank, 'dot')
    u.set_seed(random_seed)
    u.load(SparkFiles.get(tree_ann_path))  # tree_ann_path must be an absolute path
    return u.get_nns_by_vector(u_factor, n=nns, search_k=-1,
                               include_distances=False)
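# A hedged sketch of how such a UDF might be wired up in Spark. It assumes
# rank, random_seed, nns, and tree_ann_path are defined at module level (as
# the closure above requires), and user_factors_df is a hypothetical DataFrame
# with a vector-valued 'features' column.
from pyspark.sql import functions as F, types as T

spark.sparkContext.addFile(tree_ann_path)  # ship the .ann file to the workers
find_candidates = F.udf(find_candidates_udf, T.ArrayType(T.IntegerType()))
candidates_df = user_factors_df.withColumn('candidates', find_candidates('features'))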
def generate_semi_hard_triplets_from_ANN(model, sequences, entity2unique, entity2same, unique_text, test):
    predictions = model.predict(sequences)
    t = AnnoyIndex(len(predictions[0]), metric='euclidean')  # length of the item vectors to be indexed
    t.set_seed(123)
    for i in range(len(predictions)):
        t.add_item(i, predictions[i])
    t.build(100)  # 100 trees

    triplets = {'anchor': [], 'positive': [], 'negative': []}

    if test:
        NNlen = TEST_NEIGHBOR_LEN
    else:
        NNlen = TRAIN_NEIGHBOR_LEN

    for key in entity2same:
        index = entity2unique[key]
        expected_text = set(entity2same[key])
        expected_ids = [entity2unique[i] for i in expected_text]
        for positive in expected_text:
            k = entity2unique[positive]
            nearest = t.get_nns_by_vector(predictions[k], NNlen)
            dist_k = t.get_distance(index, k)
            # A semi-hard negative is farther from the anchor than the positive is.
            semi_hards = []
            for n in nearest:
                if n == index or n in expected_ids or n == k:
                    continue
                if t.get_distance(index, n) > dist_k:
                    semi_hards.append(unique_text[n])
            # shuffle(semi_hards)
            # semi_hards = semi_hards[0:20]
            for i in semi_hards:
                triplets['anchor'].append(key)
                triplets['positive'].append(unique_text[k])
                triplets['negative'].append(i)
    return triplets
def generate_extra_pair_basis(basis, X, n_neighbors, tree: AnnoyIndex,
                              distance='euclidean', verbose=True):
    '''Generate pairs that connect the extra set of data to the fitted basis.'''
    npr, dimp = X.shape

    assert (basis is not None or tree is not None), \
        "If the AnnoyIndex is not cached, the original dataset must be provided."

    # Build the tree again if it is not cached
    if tree is None:
        n, dim = basis.shape
        assert dimp == dim, "The dimension of the original dataset differs from the new one's."
        tree = AnnoyIndex(dim, metric=distance)
        if _RANDOM_STATE is not None:
            tree.set_seed(_RANDOM_STATE)
        for i in range(n):
            tree.add_item(i, basis[i, :])
        tree.build(20)
    else:
        n = tree.get_n_items()

    n_neighbors_extra = min(n_neighbors + 50, n - 1)
    nbrs = np.zeros((npr, n_neighbors_extra), dtype=np.int32)
    knn_distances = np.empty((npr, n_neighbors_extra), dtype=np.float32)

    for i in range(npr):
        nbrs[i, :], knn_distances[i, :] = tree.get_nns_by_vector(
            X[i, :], n_neighbors_extra, include_distances=True)
    print_verbose("Found nearest neighbor", verbose)

    # Scaling the distances (via sig and scale_dist) is not possible here,
    # since we don't always keep track of the basis:
    # sig = np.maximum(np.mean(knn_distances[:, 3:6], axis=1), 1e-10)
    # scaled_dist = scale_dist(knn_distances, sig, nbrs)

    pair_neighbors = sample_neighbors_pair_basis(n, X, knn_distances, nbrs, n_neighbors)
    return pair_neighbors
def load_index(path_index: PathType, meta_d: Dict) -> AnnoyIndex:
    """Relies on Annoy's use of mmap for fast loading
    (fast enough that we can load the index on every single call).
    """
    n_dim = meta_d['n_dim']
    metric = meta_d['metric']
    u = AnnoyIndex(n_dim, metric=metric)
    u.load(str(path_index))
    u.set_seed(SEED)
    return u
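# Usage sketch for load_index. SEED is a module-level constant in the
# original; the metadata values and the index filename here are illustrative.
SEED = 42

meta_d = {'n_dim': 128, 'metric': 'angular'}
index = load_index('vectors.ann', meta_d)  # hypothetical, previously saved index file
print(index.get_nns_by_item(0, 10))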
def test_seeding(self):
    f = 10
    X = numpy.random.rand(1000, f)
    Y = numpy.random.rand(50, f)

    indexes = []
    for i in range(2):
        index = AnnoyIndex(f, 'angular')
        index.set_seed(42)
        for j in range(X.shape[0]):
            index.add_item(j, X[j])
        index.build(10)
        indexes.append(index)

    for k in range(Y.shape[0]):
        self.assertEqual(indexes[0].get_nns_by_vector(Y[k], 100),
                         indexes[1].get_nns_by_vector(Y[k], 100))
def test_seeding(self):
    f = 10
    X = numpy.random.rand(1000, f)
    Y = numpy.random.rand(50, f)

    indexes = []
    for i in range(2):
        index = AnnoyIndex(f)
        index.set_seed(42)
        for j in range(X.shape[0]):
            index.add_item(j, X[j])
        index.build(10)
        indexes.append(index)

    for k in range(Y.shape[0]):
        self.assertEqual(indexes[0].get_nns_by_vector(Y[k], 100),
                         indexes[1].get_nns_by_vector(Y[k], 100))
def generate_pair(X, n_neighbors, n_MN, n_FP, distance='euclidean', verbose=True):
    '''Generate pairs for the dataset.'''
    n, dim = X.shape
    # sample more neighbors than needed
    n_neighbors_extra = min(n_neighbors + 50, n - 1)
    tree = AnnoyIndex(dim, metric=distance)
    if _RANDOM_STATE is not None:
        tree.set_seed(_RANDOM_STATE)
    for i in range(n):
        tree.add_item(i, X[i, :])
    tree.build(20)

    option = distance_to_option(distance=distance)

    nbrs = np.zeros((n, n_neighbors_extra), dtype=np.int32)
    knn_distances = np.empty((n, n_neighbors_extra), dtype=np.float32)

    for i in range(n):
        # Annoy returns the query item itself as the first neighbor; skip it.
        nbrs_ = tree.get_nns_by_item(i, n_neighbors_extra + 1)
        nbrs[i, :] = nbrs_[1:]
        for j in range(n_neighbors_extra):
            knn_distances[i, j] = tree.get_distance(i, nbrs[i, j])
    print_verbose("Found nearest neighbor", verbose)

    sig = np.maximum(np.mean(knn_distances[:, 3:6], axis=1), 1e-10)
    print_verbose("Calculated sigma", verbose)
    scaled_dist = scale_dist(knn_distances, sig, nbrs)
    print_verbose("Found scaled dist", verbose)

    pair_neighbors = sample_neighbors_pair(X, scaled_dist, nbrs, n_neighbors)
    if _RANDOM_STATE is None:
        pair_MN = sample_MN_pair(X, n_MN, option)
        pair_FP = sample_FP_pair(X, pair_neighbors, n_neighbors, n_FP)
    else:
        pair_MN = sample_MN_pair_deterministic(X, n_MN, _RANDOM_STATE, option)
        pair_FP = sample_FP_pair_deterministic(X, pair_neighbors, n_neighbors,
                                               n_FP, _RANDOM_STATE)

    return pair_neighbors, pair_MN, pair_FP, tree
def annoy_train(spark, dirname, rank, regParam, n_trees, random_seed):
    # Load the ALS model
    model = ALSModel.load(f'{dirname}/{rank}_{regParam}_model')

    # Get the item factors
    item_factors = model.itemFactors
    item_factors, annoy_index_map = convert_annoy_index(item_factors)

    # Train the Annoy model; the seed must be set before items are added
    # for the build to be reproducible.
    tree = AnnoyIndex(rank, 'dot')
    tree.set_seed(random_seed)
    for item in tqdm(item_factors.collect()):
        tree.add_item(item.annoy_id, item.features)

    # Build the tree: a higher n_trees gives higher precision
    tree.build(n_trees)

    # Save the Annoy model and the index map
    tree.save(f'{dirname}_{rank}_{regParam}_tree.ann')
    annoy_index_map.write.parquet(f'{dirname}_{rank}_{regParam}_annoy_index_map.parquet')
def predict_similar_movies(review_vectors: pd.DataFrame, parameters: Dict) -> pd.DataFrame:
    # Initialize the approximate nearest neighbor model
    vector_size = review_vectors.iat[0, 1].size
    annoy_index = AnnoyIndex(vector_size, parameters["similarity_metrics"])
    annoy_index.set_seed(parameters["random_seed"])

    # TODO: proper logging
    print(f"review size: {len(review_vectors)}")

    # Build the index
    idx2movie = {}
    for i, row in enumerate(review_vectors.itertuples()):
        idx2movie[i] = row.movie_id
        annoy_index.add_item(i, row.vector)
        # TODO: proper logging
        if i % 10 == 0:
            print(f"Added index entry {i}")
    annoy_index.build(parameters["n_tree"])

    # Predict the top-N similar movies
    similar_movies = {}
    for j in range(len(review_vectors)):
        # The same movie is always its own nearest neighbor, so shift by one.
        similar_movies[idx2movie[j]] = annoy_index.get_nns_by_item(j, parameters["predict_num"] + 1)[1:]
        if j % 10 == 0:
            print(f"Finished prediction {j}")

    return pd.DataFrame({
        "movie_id": similar_movies.keys(),
        "similar_movie_ids": [[idx2movie[movie_id] for movie_id in movie_list]
                              for movie_list in similar_movies.values()]
    })
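# An illustrative call to predict_similar_movies; the vector size and the
# parameter values below are assumptions made for this example.
import numpy as np
import pandas as pd

review_vectors = pd.DataFrame({
    "movie_id": [101, 102, 103, 104],
    "vector": [np.random.rand(8) for _ in range(4)],
})
parameters = {
    "similarity_metrics": "angular",
    "random_seed": 2020,
    "n_tree": 10,
    "predict_num": 2,
}
print(predict_similar_movies(review_vectors, parameters))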
# 100%|############################| 402111/402111 [01:02<00:00, 6455.57it/s]
len(wv.vocab), len(wv[next(iter(wv.vocab))])
# (3000000, 300)
wv.vectors.shape
# (3000000, 300)

"""
>>> from annoy import AnnoyIndex
>>> num_words, num_dimensions = wv.vectors.shape  # <1>
>>> index = AnnoyIndex(num_dimensions)
"""
from annoy import AnnoyIndex

num_words, num_dimensions = wv.vectors.shape  # <1>
index = AnnoyIndex(num_dimensions)
index.set_seed(1983)

"""
>>> from tqdm import tqdm  # <1>
>>> for i, word in enumerate(tqdm(wv.index2word)):  # <2>
...     index.add_item(i, wv[word])
 22%|#######2 | 649297/3000000 [00:26<01:35, 24587.52it/s]

<1> `tqdm()` takes an iterable and returns an iterable (like `enumerate()`)
    and inserts code in your loop to display a progress bar
<2> `.index2word` is an unsorted list of all 3M tokens in your vocabulary,
    equivalent to a map of the integer indexes (0-2999999) to tokens
    ('</s>' to 'snowcapped_Caucasus')
"""
from tqdm import tqdm

for i, word in enumerate(tqdm(wv.index2word)):
    index.add_item(i, wv[word])
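# A continuation sketch: the index still has to be built before it can be
# queried. The tree count, filename, and query word below are illustrative,
# not taken from the original text.
index.build(30)  # more trees -> better recall, bigger index
index.save('word2vec_index.ann')

# Look up a token's integer id, then ask Annoy for its nearest neighbors;
# the first hit is the query word itself, so skip it.
word_id = wv.index2word.index('cat')
neighbor_ids = index.get_nns_by_item(word_id, 11)[1:]
print([wv.index2word[i] for i in neighbor_ids])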
def test_seed(self):
    i = AnnoyIndex(10, 'angular')
    i.load('test/test.tree')
    i.set_seed(42)
class AnnoyDictionary(object):
    def __init__(self, dict_size, key_width, new_value_shift_coefficient=0.1,
                 batch_size=100, key_error_threshold=0.01):
        self.max_size = dict_size
        self.curr_size = 0
        self.new_value_shift_coefficient = new_value_shift_coefficient

        self.index = AnnoyIndex(key_width, metric='euclidean')
        self.index.set_seed(1)

        self.embeddings = np.zeros((dict_size, key_width))
        self.values = np.zeros(dict_size)

        self.lru_timestamps = np.zeros(dict_size)
        self.current_timestamp = 0.0

        # keys that are within this distance will be considered the same key
        self.key_error_threshold = key_error_threshold

        self.initial_update_size = batch_size
        self.min_update_size = self.initial_update_size

        self.key_dimension = key_width
        self.value_dimension = 1
        self._reset_buffer()

        self.built_capacity = 0

    def add(self, keys, values):
        # Adds new embeddings and values to the dictionary
        indices = []
        indices_to_remove = []
        for i in range(keys.shape[0]):
            index = self._lookup_key_index(keys[i])
            if index:
                # update the existing value
                self.values[index] += self.new_value_shift_coefficient * \
                    (values[i] - self.values[index])
                self.lru_timestamps[index] = self.current_timestamp
                indices_to_remove.append(i)
            else:
                # add a new entry
                if self.curr_size >= self.max_size:
                    # find the LRU entry
                    index = np.argmin(self.lru_timestamps)
                else:
                    index = self.curr_size
                    self.curr_size += 1
                self.lru_timestamps[index] = self.current_timestamp
                indices.append(index)

        for i in reversed(indices_to_remove):
            keys = np.delete(keys, i, 0)
            values = np.delete(values, i, 0)

        self.buffered_keys = np.vstack((self.buffered_keys, keys))
        self.buffered_values = np.vstack((self.buffered_values, values))
        self.buffered_indices = self.buffered_indices + indices

        if len(self.buffered_indices) >= self.min_update_size:
            self.min_update_size = max(self.initial_update_size,
                                       int(self.curr_size * 0.02))
            self._rebuild_index()

        self.current_timestamp += 1

    # Returns the stored embeddings and values of the closest embeddings
    def query(self, keys, k):
        if not self.has_enough_entries(k):
            # This only happens before the DND is populated with enough entries,
            # which is only during heatup; these values won't be used and are
            # therefore meaningless.
            return [0.0], [0.0], [0]

        _, indices = self._get_k_nearest_neighbors_indices(keys, k)

        embeddings = []
        values = []
        for ind in indices:
            self.lru_timestamps[ind] = self.current_timestamp
            embeddings.append(self.embeddings[ind])
            values.append(self.values[ind])
        self.current_timestamp += 1
        return embeddings, values, indices

    def has_enough_entries(self, k):
        return self.curr_size > k and (self.built_capacity > k)

    def _get_k_nearest_neighbors_indices(self, keys, k):
        distances = []
        indices = []
        for key in keys:
            index, distance = self.index.get_nns_by_vector(key, k, include_distances=True)
            distances.append(distance)
            indices.append(index)
        return distances, indices

    def _rebuild_index(self):
        self.index.unbuild()
        self.embeddings[self.buffered_indices] = self.buffered_keys
        self.values[self.buffered_indices] = np.squeeze(self.buffered_values)
        for idx, key in zip(self.buffered_indices, self.buffered_keys):
            self.index.add_item(idx, key)
        self._reset_buffer()
        self.index.build(50)
        self.built_capacity = self.curr_size

    def _reset_buffer(self):
        self.buffered_keys = np.zeros((0, self.key_dimension))
        self.buffered_values = np.zeros((0, self.value_dimension))
        self.buffered_indices = []

    def _lookup_key_index(self, key):
        distance, index = self._get_k_nearest_neighbors_indices([key], 1)
        if distance != [[]] and distance[0][0] <= self.key_error_threshold:
            return index
        return None
class Annoy(KNNIndex):
    VALID_METRICS = [
        "cosine",
        "euclidean",
        "manhattan",
        "hamming",
        "dot",
        "l1",
        "l2",
        "taxicab",
    ]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__data = None

    def build(self, data, k):
        from annoy import AnnoyIndex

        N = data.shape[0]

        annoy_metric = self.metric
        annoy_aliases = {
            "cosine": "angular",
            "l1": "manhattan",
            "l2": "euclidean",
            "taxicab": "manhattan",
        }
        if annoy_metric in annoy_aliases:
            annoy_metric = annoy_aliases[annoy_metric]

        self.index = AnnoyIndex(data.shape[1], annoy_metric)

        if self.random_state:
            self.index.set_seed(self.random_state)

        for i in range(N):
            self.index.add_item(i, data[i])

        # Number of trees. FIt-SNE uses 50 by default.
        self.index.build(50)

        # Return the nearest neighbors in the training set
        distances = np.zeros((N, k))
        indices = np.zeros((N, k)).astype(int)

        def getnns(i):
            # Annoy returns the query point itself as the first element
            indices_i, distances_i = self.index.get_nns_by_item(
                i, k + 1, include_distances=True
            )
            indices[i] = indices_i[1:]
            distances[i] = distances_i[1:]

        if self.n_jobs == 1:
            for i in range(N):
                getnns(i)
        else:
            from joblib import Parallel, delayed

            Parallel(n_jobs=self.n_jobs, require="sharedmem")(
                delayed(getnns)(i) for i in range(N)
            )

        return indices, distances

    def query(self, query, k):
        N = query.shape[0]
        distances = np.zeros((N, k))
        indices = np.zeros((N, k)).astype(int)

        # The worker takes the row index (not the vector) so that it can
        # write its results into the shared output arrays.
        def getnns(i):
            # Annoy returns the query point itself as the first element,
            # so ask for one extra neighbor and drop it.
            indices_i, distances_i = self.index.get_nns_by_vector(
                query[i], k + 1, include_distances=True
            )
            indices[i] = indices_i[1:]
            distances[i] = distances_i[1:]

        if self.n_jobs == 1:
            for i in range(N):
                getnns(i)
        else:
            from joblib import Parallel, delayed

            Parallel(n_jobs=self.n_jobs, require="sharedmem")(
                delayed(getnns)(i) for i in range(N)
            )

        return indices, distances
log.debug(f'df_click shape: {df_click.shape}')
log.debug(f'{df_click.head()}')

# Get the word vector for each article id
article_vec_map = word2vec(df_click, 'user_id', 'click_article_id', model_path)

# Save the word vectors to a file
f = open(w2v_file, 'wb')
pickle.dump(article_vec_map, f)
f.close()

# In short: run approximate nearest-neighbor computation over the loaded
# vectors and build a tree-structured index, which makes lookups much faster
# at the cost of some approximation accuracy.
article_index = AnnoyIndex(256, 'angular')  # metric='angular' uses the angular (cosine) distance
article_index.set_seed(2020)

# Load the article_id -> vector mapping and add it to the AnnoyIndex
for article_id, emb in tqdm(article_vec_map.items()):
    article_index.add_item(article_id, emb)

# The tree count is set to 100; the larger the better, memory permitting
article_index.build(100)

user_item_ = df_click.groupby('user_id')['click_article_id'].agg(
    lambda x: list(x)).reset_index()
user_item_dict = dict(
    zip(user_item_['user_id'], user_item_['click_article_id']))

# Recall
n_split = max_threads
all_users = df_query['user_id'].unique()
shuffle(all_users)
class Annoy_Dict(LRU_KNN_ANNOY):
    def __init__(self, config):
        super(Annoy_Dict, self).__init__(config)
        self.config = config
        self.key_dim = self.config.knn_key_dim

        self.index = AnnoyIndex(self.key_dim, metric='euclidean')
        self.index.set_seed(123)

        self.initial_update_size = self.config.knn_dict_update_step
        self.min_update_size = self.initial_update_size

        self.cached_embs = []
        self.cached_vals = []
        self.cached_terminals = []
        self.cached_embs_next = []
        self.cached_indices = []

        self.build_capacity = 0

    def _nn(self, keys, k):
        assert np.ndim(keys) == 2
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(key, k, include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def _insert(self, keys, values, terminal, keys_next, indices):
        self.cached_embs = self.cached_embs + keys
        self.cached_vals = self.cached_vals + values
        self.cached_terminals = self.cached_terminals + terminal
        self.cached_embs_next = self.cached_embs_next + keys_next
        self.cached_indices = self.cached_indices + indices

        if len(self.cached_indices) >= self.min_update_size:
            # self.min_update_size = max(self.initial_update_size, self.curr_capacity * 0.02)
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_emb = self.cached_embs[i]
            new_val = self.cached_vals[i]
            new_t = self.cached_terminals[i]
            new_emb_next = self.cached_embs_next[i]

            self.embs[ind] = new_emb
            self.values[ind] = new_val
            self.terminal[ind] = new_t
            self.embs_next[ind] = new_emb_next

            self.index.add_item(ind, new_emb)

        self.cached_embs = []
        self.cached_vals = []
        self.cached_terminals = []
        self.cached_embs_next = []
        self.cached_indices = []

        self.index.build(50)
        self.build_capacity = self.curr_capacity

    def _rebuild(self):
        self.index.unbuild()
        for ind, emb in enumerate(self.embs[:self.curr_capacity]):
            self.index.add_item(ind, emb)
        self.index.build(50)
        self.build_capacity = self.curr_capacity

    def queryable(self, k):
        return (LRU_KNN_ANNOY.queryable(self, k) and (self.build_capacity > k))

    @property
    def capacity_(self):
        return self.index.get_n_items()
def get_knn_graph(X, k=5, dist_metric='euclidean', approx=False,
                  return_edges=True, random_seed=0):
    '''Build a k-nearest-neighbor graph.
    Returns an edge list and the nearest neighbor matrix.
    '''
    t0 = time.time()
    if approx:
        try:
            from annoy import AnnoyIndex
        except ImportError:
            approx = False
            print('Could not find library "annoy" for approx. nearest neighbor search')

    if approx:
        # Approximate nearest neighbor search via annoy
        if dist_metric == 'cosine':
            dist_metric = 'angular'
        npc = X.shape[1]
        ncell = X.shape[0]
        annoy_index = AnnoyIndex(npc, metric=dist_metric)
        annoy_index.set_seed(random_seed)
        for i in range(ncell):
            annoy_index.add_item(i, list(X[i, :]))
        annoy_index.build(10)  # 10 trees
        knn = []
        for iCell in range(ncell):
            # the first neighbor is the query item itself; skip it
            knn.append(annoy_index.get_nns_by_item(iCell, k + 1)[1:])
        knn = np.array(knn, dtype=int)
    else:
        # Exact search via sklearn NearestNeighbors
        if dist_metric == 'cosine':
            nbrs = NearestNeighbors(n_neighbors=k, metric=dist_metric,
                                    algorithm='brute').fit(X)
        else:
            nbrs = NearestNeighbors(n_neighbors=k, metric=dist_metric).fit(X)
        knn = nbrs.kneighbors(return_distance=False)

    if return_edges:
        links = set([])
        for i in range(knn.shape[0]):
            for j in knn[i, :]:
                links.add(tuple(sorted((i, j))))
        t_elapse = time.time() - t0
        # print('kNN graph built in %.3f sec' % t_elapse)
        return links, knn
    return knn
class alpha_KNN:
    def __init__(self, capacity, key_dimension, delta=0.001, alpha=0.1,
                 batch_size=1000):
        self.capacity = capacity
        self.curr_capacity = 0
        self.delta = delta
        self.alpha = alpha

        self.embeddings = np.zeros((capacity, key_dimension))
        self.values = np.zeros(capacity)
        self.weights = np.zeros(capacity)

        from annoy import AnnoyIndex
        # Keys are stored augmented with their weight, hence the extra dimension.
        self.index = AnnoyIndex(key_dimension + 1, metric='euclidean')
        self.index.set_seed(123)

        self.min_update_size = batch_size
        self.cached_keys = []
        self.cached_values = []
        self.cached_indices = []
        self.built_capacity = 0

    def _nn(self, keys, k):
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(key + [1.0], k,
                                                     include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def _insert(self, keys, values, indices):
        self.cached_keys = self.cached_keys + keys
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices

        if len(self.cached_indices) >= self.min_update_size:
            self._rebuild_index()

    def _rebuild_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_key = self.cached_keys[i]
            new_value = self.cached_values[i]
            self.embeddings[ind] = new_key
            self.values[ind] = new_value
            # the weight was already set when the entry was added
            self.index.add_item(ind, new_key + [self.weights[ind]])

        self.cached_keys = []
        self.cached_values = []
        self.cached_indices = []

        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def queryable(self, k):
        return (self.built_capacity > k)

    # Returns the stored embeddings and values of the closest embeddings
    def query(self, keys, k):
        _, indices = self._nn(keys, k)
        embs = []
        values = []
        weights = []
        for ind in indices:
            embs.append(self.embeddings[ind])
            values.append(self.values[ind])
            weights.append(self.weights[ind])
        return embs, values, weights

    # Adds new embeddings (and values) to the dictionary
    def add(self, keys, values):
        if self.queryable(5):
            dists, inds = self._nn(keys, k=5)
            for ind, dist in enumerate(dists):
                for i, d in enumerate(dist):
                    index = inds[ind][i]
                    self.weights[index] *= (1 - self.alpha)

        indices, keys_, values_ = [], [], []
        for i, _ in enumerate(keys):
            if self.curr_capacity >= self.capacity:
                # evict the lowest-weight entry
                index = np.argmin(self.weights)
            else:
                index = self.curr_capacity
                self.curr_capacity += 1
            self.weights[index] = 1.0
            indices.append(index)
            keys_.append(keys[i])
            values_.append(values[i])

        self._insert(keys_, values_, indices)
class annoy_dict(LRU_KNN):
    def __init__(self, capacity, key_dimension, delta=0.001, alpha=0.1,
                 batch_size=100):
        LRU_KNN.__init__(self, capacity, key_dimension, delta, alpha)

        from annoy import AnnoyIndex
        self.index = AnnoyIndex(key_dimension, metric='euclidean')
        self.index.set_seed(123)

        self.initial_update_size = batch_size
        self.min_update_size = self.initial_update_size
        self.cached_keys = []
        self.cached_values = []
        self.cached_indices = []
        self.built_capacity = 0

    def _nn(self, keys, k):
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(key, k, include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def _insert(self, keys, values, indices):
        self.cached_keys = self.cached_keys + keys
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices

        if len(self.cached_indices) >= self.min_update_size:
            self.min_update_size = max(self.initial_update_size,
                                       self.curr_capacity * 0.02)
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_key = self.cached_keys[i]
            new_value = self.cached_values[i]
            self.embeddings[ind] = new_key
            self.values[ind] = new_value
            self.index.add_item(ind, new_key)

        self.cached_keys = []
        self.cached_values = []
        self.cached_indices = []

        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def _rebuild_index(self):
        self.index.unbuild()
        for ind, emb in enumerate(self.embeddings[:self.curr_capacity]):
            self.index.add_item(ind, emb)
        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def queryable(self, k):
        return (LRU_KNN.queryable(self, k) and (self.built_capacity > k))
class LRU_KNN:
    def __init__(self, capacity, key_dim, value_dim, batch_size):
        self.capacity = capacity
        self.curr_capacity = 0

        self.states = np.zeros((capacity, key_dim))
        self.values = np.zeros((capacity, value_dim))
        self.lru = np.zeros(capacity)
        self.tm = 0.0

        self.index = AnnoyIndex(key_dim, metric="euclidean")
        self.index.set_seed(123)

        self.initial_update_size = batch_size
        self.min_update_size = self.initial_update_size
        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []
        self.built_capacity = 0

    def nn(self, keys, k):
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(key, k, include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def query(self, keys, k):
        _, indices = self.nn(keys, k)
        states = []
        values = []
        for ind in indices:
            self.lru[ind] = self.tm
            states.append(self.states[ind])
            values.append(self.values[ind])
        self.tm += 0.001
        return states, values

    def _insert(self, keys, values, indices):
        self.cached_states = self.cached_states + keys
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices

        if len(self.cached_states) >= self.min_update_size:
            self.min_update_size = max(self.initial_update_size,
                                       self.curr_capacity * 0.02)
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_state = self.cached_states[i]
            new_value = self.cached_values[i]
            self.states[ind] = new_state
            self.values[ind] = new_value
            self.index.add_item(ind, new_state)

        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def _rebuild_index(self):
        self.index.unbuild()
        for ind, state in enumerate(self.states[:self.curr_capacity]):
            self.index.add_item(ind, state)
        self.index.build(50)
        self.built_capacity = self.curr_capacity
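# A minimal exercise of LRU_KNN with illustrative sizes, assuming numpy and
# AnnoyIndex are imported alongside the class. batch_size=4 makes the first
# insert reach min_update_size and trigger an index build.
import numpy as np

mem = LRU_KNN(capacity=1000, key_dim=8, value_dim=1, batch_size=4)
keys = [np.random.rand(8).tolist() for _ in range(4)]
vals = [[float(i)] for i in range(4)]
mem._insert(keys, vals, [0, 1, 2, 3])       # fills slots 0-3 and builds the index
states, values = mem.query([keys[0]], k=2)  # nearest stored states and their values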
#!/usr/bin/env python
# encoding: utf-8
from annoy import AnnoyIndex

test_dims = [64]

for dim in test_dims:
    a = AnnoyIndex(dim, 'angular')
    d = AnnoyIndex(dim, 'dot')
    e = AnnoyIndex(dim, 'euclidean')
    a.set_seed(123)
    d.set_seed(123)
    e.set_seed(123)

    vectors = open('item_vector.txt').readlines()
    for index, vector in enumerate(vectors):
        v = [float(x) for x in vector.split(',')]
        a.add_item(index, v)
        d.add_item(index, v)
        e.add_item(index, v)

    a.build(3)
    a.save('points.angular.annoy.{}'.format(dim))
    d.build(3)
    d.save('points.dot.annoy.{}'.format(dim))
    e.build(3)
    e.save('points.euclidean.annoy.{}'.format(dim))
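# A follow-up sketch: loading one of the saved indices and querying it.
# Assumes the 64-dimensional angular index written by the script above.
from annoy import AnnoyIndex

idx = AnnoyIndex(64, 'angular')
idx.load('points.angular.annoy.64')  # mmaps the file, so loading is near-instant
print(idx.get_nns_by_item(0, 5, include_distances=True))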
import numpy as np
from annoy import AnnoyIndex

X = np.random.rand(100000, 60)
Y = np.random.rand(500, 60)

annoy1 = AnnoyIndex(60, 'angular')  # angular was the implicit default metric
annoy1.set_seed(100)
for i in range(X.shape[0]):
    annoy1.add_item(i, X[i, :])
annoy1.build(10)

annoy2 = AnnoyIndex(60, 'angular')
annoy2.set_seed(100)
for j in range(X.shape[0]):
    annoy2.add_item(j, X[j, :])
annoy2.build(10)

result1 = []
result2 = []
for k in range(Y.shape[0]):
    print("annoy1", annoy1.get_nns_by_vector(Y[k, :], 3))
    result1 += annoy1.get_nns_by_vector(Y[k, :], 3)
    print("annoy2", annoy2.get_nns_by_vector(Y[k, :], 3))
    result2 += annoy2.get_nns_by_vector(Y[k, :], 3)
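# Both indexes were built from the same items with the same seed and tree
# count, so the seeded builds should be identical and every query should
# return the same neighbors; a quick check:
assert result1 == result2, "seeded builds diverged"
print("both seeded indexes returned identical neighbors")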
class Memory:
    def __init__(self, capacity, state_dim, value_dim):
        self.capacity = capacity
        print("state_dim:", state_dim)
        self.states = np.zeros((capacity, state_dim))
        self.values = np.zeros((capacity, value_dim))
        self.curr_capacity = 0
        self.curr_ = 0
        self.lru = np.zeros(capacity)
        self.tm = 0

        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

        self.index = AnnoyIndex(state_dim)
        self.index.set_seed(123)
        self.update_size = 1
        self.build_capacity = 0

    def sample_knn_test(self, state, k):
        inds, dists = self.index.get_nns_by_vector(state, k, include_distances=True)
        self.tm += 0.01
        self.lru[inds] = self.tm
        return self.states[inds], self.values[inds], dists

    def sample_knn(self, states, k):
        dists = []
        inds = []
        for state in states:
            ind, dist = self.index.get_nns_by_vector(state, k, include_distances=True)
            inds.append(ind)
            dists.append(dist)
        self.tm += 0.01
        self.lru[inds] = self.tm
        return self.states[inds], self.values[inds], dists

    def sample(self, n_samples):
        if self.curr_capacity < n_samples or n_samples == 0:
            idx = np.random.choice(np.arange(len(self.states)), n_samples, replace=False)
        else:
            idx = np.random.choice(np.arange(self.curr_capacity), n_samples, replace=False)
        self.tm += 0.01
        self.lru[idx] = self.tm
        embs = self.states[idx]
        values = self.values[idx]
        return embs, values

    def add_knn(self, states, values):
        self._add_knn(states, values)

    def add_knn_lru(self, states, values):
        self._add_knn(states, values, lru=True)

    def add(self, states, values):
        self._add(states, values)

    def add_lru(self, states, values):
        self._add(states, values, lru=True)

    def add_rand(self, states, values):
        self._add(states, values, rand=True)

    def _insert(self, states, values, indices):
        self.cached_states = self.cached_states + states
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices

        if len(self.cached_states) >= self.update_size:
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            self.states[ind] = self.cached_states[i]
            self.values[ind] = self.cached_values[i]
            self.index.add_item(ind, self.cached_states[i])
        self.index.build(50)
        self.build_capacity = self.curr_capacity

        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

    def _rebuild_index(self):
        self.index.unbuild()
        for ind, state in enumerate(self.states[:self.curr_capacity]):
            self.index.add_item(ind, state)
        self.index.build(50)
        self.build_capacity = self.curr_capacity

    def _add_knn(self, states, values, lru=False):
        indices = []
        states_ = []
        values_ = []
        for i, _ in enumerate(states):
            if lru:
                if self.curr_capacity >= self.capacity:
                    ind = np.argmin(self.lru)
                else:
                    ind = self.curr_capacity
                    self.curr_capacity += 1
            else:
                if self.curr_capacity >= self.capacity:
                    self.curr_ = (self.curr_ + 1) % self.capacity
                    ind = self.curr_
                else:
                    ind = self.curr_capacity
                    self.curr_capacity += 1
            self.lru[ind] = self.tm
            indices.append(ind)
            states_.append(states[i])
            values_.append(values[i])
        self._insert(states_, values_, indices)

    def _add(self, states, values, rand=False, lru=False):
        for i, state in enumerate(states):
            if self.curr_capacity < self.capacity:
                # still filling up: advance the write cursor and grow
                self.curr_ = (self.curr_ + 1) % self.capacity
                self.curr_capacity += 1
            else:
                # full: pick a slot to overwrite
                if lru:
                    self.curr_ = np.argmin(self.lru)
                if rand:
                    self.curr_ = np.random.choice(np.arange(self.curr_capacity),
                                                  1, replace=False)
                if not lru and not rand:
                    self.curr_ = (self.curr_ + 1) % self.capacity
            self.states[self.curr_] = state
            self.values[self.curr_] = values[i]

    @property
    def length(self):
        # assert self.index.get_n_items() == self.curr_capacity
        return self.index.get_n_items()
def test_seed(self):
    i = AnnoyIndex(10)
    i.load('test/test.tree')
    i.set_seed(42)
def generate_triplets_from_ANN(model, sequences, entity2unique, entity2same, unique_text, test):
    predictions = model.predict(sequences)
    t = AnnoyIndex(len(predictions[0]), metric='euclidean')  # length of the item vectors to be indexed
    t.set_seed(123)
    for i in range(len(predictions)):
        t.add_item(i, predictions[i])
    t.build(100)  # 100 trees

    match = 0
    no_match = 0
    ann_accuracy = 0
    total = 0
    pos_distances = []
    neg_distances = []

    triplets = {'anchor': [], 'positive': [], 'negative': []}

    if test:
        NNlen = TEST_NEIGHBOR_LEN
    else:
        NNlen = TRAIN_NEIGHBOR_LEN

    for key in entity2same:
        index = entity2unique[key]
        nearest = t.get_nns_by_vector(predictions[index], NNlen)
        nearest_text = set([unique_text[i] for i in nearest])
        expected_text = set(entity2same[key])
        # Annoy returns the queried item itself as a nearest neighbor; remove it.
        if key in nearest_text:
            nearest_text.remove(key)
        overlap = expected_text.intersection(nearest_text)
        # collect some statistics on how well the match went
        m = len(overlap)
        match += m
        # We asked for NNlen nearest neighbors, so at most NNlen - 1 of them
        # can be items other than the key itself; adjust the no-match count accordingly.
        no_match += min(len(expected_text), NNlen - 1) - m
        # Sample only true negatives, i.e. neighbors that are not in the
        # expected set (sampling only semi-hard negatives is not defined here).
        positives = overlap
        negatives = nearest_text - expected_text
        for i in negatives:
            for j in positives:
                dist_pos = t.get_distance(index, entity2unique[j])
                pos_distances.append(dist_pos)
                dist_neg = t.get_distance(index, entity2unique[i])
                neg_distances.append(dist_neg)
                if dist_pos < dist_neg:
                    ann_accuracy += 1
                total += 1
        for i in negatives:
            for j in expected_text:
                triplets['anchor'].append(key)
                triplets['positive'].append(j)
                triplets['negative'].append(i)

    print("mean positive distance: " + str(statistics.mean(pos_distances)))
    print("stdev positive distance: " + str(statistics.stdev(pos_distances)))
    print("max positive distance: " + str(max(pos_distances)))
    print("mean neg distance: " + str(statistics.mean(neg_distances)))
    print("stdev neg distance: " + str(statistics.stdev(neg_distances)))
    print("max neg distance: " + str(max(neg_distances)))
    print("Accuracy in the ANN for triplets that obey the distance func: " + str(ann_accuracy / total))

    obj = {'accuracy': ann_accuracy / total, 'steps': 1}
    with open(output_file_name_for_hpo, 'w') as out:
        json.dump(obj, out)

    if test:
        return match / (match + no_match)
    else:
        return triplets, match / (match + no_match)
class Annoy(KNNIndex):
    VALID_METRICS = [
        "cosine",
        "euclidean",
        "manhattan",
        "hamming",
        "dot",
        "l1",
        "l2",
        "taxicab",
    ]

    def build(self, data, k):
        timer = utils.Timer(
            f"Finding {k} nearest neighbors using Annoy approximate search using "
            f"{self.metric} distance...",
            verbose=self.verbose,
        )
        timer.__enter__()

        from annoy import AnnoyIndex

        N = data.shape[0]

        annoy_metric = self.metric
        annoy_aliases = {
            "cosine": "angular",
            "l1": "manhattan",
            "l2": "euclidean",
            "taxicab": "manhattan",
        }
        if annoy_metric in annoy_aliases:
            annoy_metric = annoy_aliases[annoy_metric]

        self.index = AnnoyIndex(data.shape[1], annoy_metric)

        if self.random_state:
            self.index.set_seed(self.random_state)

        for i in range(N):
            self.index.add_item(i, data[i])

        # Number of trees. FIt-SNE uses 50 by default.
        self.index.build(50)

        # Return the nearest neighbors in the training set
        distances = np.zeros((N, k))
        indices = np.zeros((N, k)).astype(int)

        def getnns(i):
            # Annoy returns the query point itself as the first element
            indices_i, distances_i = self.index.get_nns_by_item(
                i, k + 1, include_distances=True)
            indices[i] = indices_i[1:]
            distances[i] = distances_i[1:]

        if self.n_jobs == 1:
            for i in range(N):
                getnns(i)
        else:
            from joblib import Parallel, delayed

            Parallel(n_jobs=self.n_jobs, require="sharedmem")(
                delayed(getnns)(i) for i in range(N))

        timer.__exit__()

        return indices, distances

    def query(self, query, k):
        timer = utils.Timer(
            f"Finding {k} nearest neighbors in existing embedding using Annoy "
            f"approximate search...",
            self.verbose,
        )
        timer.__enter__()

        N = query.shape[0]
        distances = np.zeros((N, k))
        indices = np.zeros((N, k)).astype(int)

        def getnns(i):
            indices[i], distances[i] = self.index.get_nns_by_vector(
                query[i], k, include_distances=True)

        if self.n_jobs == 1:
            for i in range(N):
                getnns(i)
        else:
            from joblib import Parallel, delayed

            Parallel(n_jobs=self.n_jobs, require="sharedmem")(
                delayed(getnns)(i) for i in range(N))

        timer.__exit__()

        return indices, distances