class spherefaceAnnoyDatabase():
    def __init__(self):
        self.network = caffe.Net("pretrainedModels/sphereface_deploy.prototxt",
                                 "pretrainedModels/sphereface_model.caffemodel", 0)
        # 512 is the number of neurons in the last layer of the net
        self.index = AnnoyIndex(512, metric='angular')
        self.indexToName = {}
        self.nameToIndex = {}

    def getEmbedding(self, imgPath):
        img = Image.open(imgPath)
        # Resize to the network's input shape (width x height taken from the data blob)
        sampleImage = numpy.array(img.resize((self.network.blobs['data'].data.shape[3],
                                              self.network.blobs['data'].data.shape[2])))
        sampleImage = numpy.reshape(sampleImage, (1,) + sampleImage.shape).transpose(0, 3, 1, 2).astype(numpy.float32)
        self.network.blobs['data'].data[...] = sampleImage
        self.network.forward()
        return self.network.blobs['fc5'].data[0].copy()

    def addFaceWithName(self, imgPath, name):
        embedding = self.getEmbedding(imgPath)
        length = self.index.get_n_items()
        self.index.add_item(length, embedding)
        self.indexToName[length] = name
        self.nameToIndex[name] = length

    def addEmbeddingWithName(self, embedding, name):
        length = self.index.get_n_items()
        self.index.add_item(length, embedding)
        self.indexToName[length] = name
        self.nameToIndex[name] = length

    def addFaceWithoutName(self, imgPath):
        embedding = self.getEmbedding(imgPath)
        length = self.index.get_n_items()
        self.index.add_item(length, embedding)
        self.indexToName[length] = imgPath
        self.nameToIndex[imgPath] = length

    def freeze(self, nTrees=20):
        self.index.build(nTrees)

    def lookupByFace(self, imgPath, numberOfNeighbours):
        embedding = self.getEmbedding(imgPath)
        results = self.index.get_nns_by_vector(embedding, numberOfNeighbours, search_k=-1, include_distances=True)
        for i in xrange(len(results[0])):
            results[0][i] = self.indexToName[results[0][i]]
        return results

    def lookupByEmbedding(self, embedding, numberOfNeighbours):
        if numberOfNeighbours == -1:
            numberOfNeighbours = self.index.get_n_items()
        results = self.index.get_nns_by_vector(embedding, numberOfNeighbours, search_k=-1, include_distances=True)
        for i in xrange(len(results[0])):
            results[0][i] = self.indexToName[results[0][i]]
        return results

    def lookupByName(self, name, numberOfNeighbours):
        if numberOfNeighbours == -1:
            numberOfNeighbours = self.index.get_n_items()
        results = self.index.get_nns_by_item(self.nameToIndex[name], numberOfNeighbours, search_k=-1, include_distances=True)
        for i in xrange(len(results[0])):
            results[0][i] = self.indexToName[results[0][i]]
        return results
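# Hedged usage sketch (not part of the original source): shows the intended add -> freeze -> lookup
# flow of spherefaceAnnoyDatabase. It assumes caffe, annoy and PIL are available, that the
# pretrained sphereface files exist, and the image paths below are placeholders.
db = spherefaceAnnoyDatabase()
db.addFaceWithName("faces/alice_01.jpg", "alice")   # hypothetical image path
db.addFaceWithName("faces/bob_01.jpg", "bob")       # hypothetical image path
db.freeze(nTrees=20)                                # the Annoy index must be built before querying
names, distances = db.lookupByFace("faces/query.jpg", 2)
# names is a list of stored names ordered by similarity, distances the matching angular distances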
def create_walks(df, index_file, patient_dict_file, index_dict_file, n_neighbors=25,
                 walks_per_patient=10, walk_size=50, out_dir="./"):
    index = AnnoyIndex(df.shape[1])
    index.load(index_file)

    patient_dict = {}
    for key, val in csv.reader(open(patient_dict_file)):
        patient_dict[key] = int(val)

    index_dict = {}
    for key, val in csv.reader(open(index_dict_file)):
        index_dict[int(key)] = val

    print("Computing nearest-neighbors...")
    neighbor_dict = {}
    for i in range(index.get_n_items()):
        if i % 1000 == 0:
            print str(i)
        patient_id = index_dict[i]
        neighbors = index.get_nns_by_item(i=i, n=n_neighbors, search_k=-1, include_distances=False)
        neighbor_ids = [index_dict[x] for x in neighbors]
        neighbor_dict[patient_id] = neighbor_ids

    f = open(out_dir + "patient_walks.txt", 'wb')
    for i in range(index.get_n_items()):
        if i % 1000 == 0:
            print str(i)
        patient_id = index_dict[i]
        patient_sentences = ""
        for j in range(walks_per_patient):
            sentence = generate_sentence(start=patient_id, neighbor_dict=neighbor_dict,
                                         n_neighbors=n_neighbors, walk_size=walk_size)
            patient_sentences += sentence + "\n"  # accumulate every walk for this patient
        ## Write it ##
        f.write(patient_sentences)
def build_annoy_indices(input_words, input_vectors):
    print("Building Annoy Indices: {0}".format(datetime.now().time()))
    sem = AnnoyIndex(99, metric="euclidean")
    phon = AnnoyIndex(100, metric="euclidean")
    index = 0

    print("Reading Data for Semantic Index: {0}".format(datetime.now().time()))
    for row in open("semantic_vectors_weighted82.txt"):
        spl = row.find("@@@")
        line = row[0:spl - 1].lower()
        vec = row[spl + 3:-1]
        vals = np.array([float(val) for val in vec.split(", ")])
        if line not in lookup:
            sem.add_item(index, vals)
            slines[index] = line
            lookup[line] = [index]
            index += 1
            if index % 100000 == 0:
                print("......{0} vectors loaded.".format(index))

    last_index = index + 1
    for i in range(len(input_words)):
        sem.add_item(last_index, input_vectors[i])  # add input vector so its neighbors can be calculated
        lookup[input_words[i]] = [last_index]
        slines[last_index] = input_words[i]
        last_index += 1

    print("Building Semantic Index: {0}".format(datetime.now().time()))
    sem.build(150)
    print("Built: {0}".format(datetime.now().time()))
    print("Num items in semantic index: {0}".format(sem.get_n_items()))

    print("Reading Data for Phonetic Index: {0}".format(datetime.now().time()))
    pindex = 0
    for row in open("phonetic_vectors_every2_d100_reformatted.txt"):
        spl = row.find("@@@")
        line = row[0:spl - 1]
        stripped_line = line[2:-1].lower()  # skip the b''
        vec = row[spl + 3:-1]
        vals = np.array([float(val) for val in vec.split(", ")])
        if stripped_line in lookup:
            phon.add_item(pindex, vals)
            lookup[stripped_line].append(pindex)
            plines[pindex] = stripped_line
            pindex += 1
            if pindex % 100000 == 0:
                print("......{0} vectors loaded.".format(pindex))

    print("Building Phonetic Index: {0}".format(datetime.now().time()))
    phon.build(150)
    print("Built: {0}".format(datetime.now().time()))
    print("Num items in phonetic index: {0}".format(phon.get_n_items()))
    print("Done Building Annoy Indices: {0}".format(datetime.now().time()))
    return sem, phon
def test_item_vector_after_save(self):
    # Issue #279
    a = AnnoyIndex(3)
    a.verbose(True)
    a.add_item(1, [1, 0, 0])
    a.add_item(2, [0, 1, 0])
    a.add_item(3, [0, 0, 1])
    a.build(-1)
    self.assertEquals(a.get_n_items(), 4)
    a.get_item_vector(3)
    a.save('something.annoy')
    self.assertEquals(a.get_n_items(), 4)
    a.get_item_vector(3)
def test_item_vector_after_save(self):
    # Issue #279
    a = AnnoyIndex(3, 'angular')
    a.verbose(True)
    a.add_item(1, [1, 0, 0])
    a.add_item(2, [0, 1, 0])
    a.add_item(3, [0, 0, 1])
    a.build(-1)
    self.assertEqual(a.get_n_items(), 4)
    self.assertEqual(a.get_item_vector(3), [0, 0, 1])
    self.assertEqual(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))
    a.save('something.annoy')
    self.assertEqual(a.get_n_items(), 4)
    self.assertEqual(a.get_item_vector(3), [0, 0, 1])
    self.assertEqual(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))
def test_get_n_items(self):
    print "test_get_n_items"
    os.system("rm -rf test_db")
    os.system("mkdir test_db")
    f = 3
    i = AnnoyIndex(f, 2, "test_db", 10, 1000, 3048576000, 0)
    # i.verbose(True)
    i.create()
    i.add_item(0, [0, 0, 1])
    self.assertEqual(i.get_n_items(), 1)
    i.add_item(1, [0, 1, 0])
    self.assertEqual(i.get_n_items(), 2)
    i.add_item(2, [1, 0, 0])
    self.assertEqual(i.get_n_items(), 3)
def test_get_n_items(self):
    print "test_get_n_items"
    os.system("rm -rf test_db")
    os.system("mkdir test_db")
    f = 3
    i = AnnoyIndex(f, 2, "test_db", 10, 1000, 3048576000, 0)
    # i.verbose(True)
    i.create()
    i.add_item(0, [0, 0, 1])
    self.assertEqual(i.get_n_items(), 1)
    i.add_item(1, [0, 1, 0])
    self.assertEqual(i.get_n_items(), 2)
    i.add_item(2, [1, 0, 0])
    self.assertEqual(i.get_n_items(), 3)
def test_item_vector_after_save(self):
    # Issue #279
    a = AnnoyIndex(3)
    a.verbose(True)
    a.add_item(1, [1, 0, 0])
    a.add_item(2, [0, 1, 0])
    a.add_item(3, [0, 0, 1])
    a.build(-1)
    self.assertEqual(a.get_n_items(), 4)
    self.assertEqual(a.get_item_vector(3), [0, 0, 1])
    self.assertEqual(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))
    a.save('something.annoy')
    self.assertEqual(a.get_n_items(), 4)
    self.assertEqual(a.get_item_vector(3), [0, 0, 1])
    self.assertEqual(set(a.get_nns_by_item(1, 999)), set([1, 2, 3]))
def main():
    t = AnnoyIndex(99, metric='euclidean')
    lines = dict()
    lookup = dict()
    print("loading...")
    index = 0
    for row in open("semantic_vectors_weighted91.txt"):
        spl = row.find("@@@")
        line = row[0:spl - 1].lower()
        vec = row[spl + 3:-1]
        vals = np.array([float(val) for val in vec.split(", ")])
        if line in lookup:
            continue
        t.add_item(index, vals)
        lines[index] = line
        lookup[line] = [index]
        index += 1
        if index % 50000 == 0:
            print(line)
            print("{0} vectors loaded".format(index))

    print("building")
    t.build(100)
    print("done.")

    # item ids run from 0 to get_n_items() - 1
    nums1 = [random.randint(0, t.get_n_items() - 1) for i in range(5)]
    nums2 = [random.randint(0, t.get_n_items() - 1) for i in range(5)]
    poem = [nums1, nums2]
    for s in poem:
        for line in s:
            print(lines[line])
        print("\n")
def k_neighbors(shape_features: {}, db_features: {}, k=s.KNN_SIZE) -> []:
    """
    It determines the closest shapes to the query shape by computing K-Nearest Neighbors
    on a N-dimensional Approximate Nearest Neighbors feature mapping.
    ----------------------------
    Args:
        shape_features (obj: 'dict'): The dictionary containing the feature metrics of the shape
        db_features (obj: 'dict'): The dictionary containing the feature metrics of the shapes
        k (int): The number of neighbors to return, the default value specified in Settings

    Returns:
        neighbors (obj: 'dict'): The dictionary containing the closest shapes (key) and the
        respective distance to the query shape (value)
    """
    ann = AnnoyIndex(56, 'euclidean')  # 56 features

    for id, featureList in db_features.items():
        features_flatten = flatten_features_array(featureList)
        ann.add_item(id, features_flatten)

    shape_features_flat = flatten_features_array(shape_features)

    # To get the neighbors, it is necessary to add the new item to the mapping first
    shape_id = ann.get_n_items()
    ann.add_item(shape_id, shape_features_flat)
    ann.build(s.CATEGORIES)

    neighbors = ann.get_nns_by_item(shape_id, k, include_distances=True)

    return neighbors
def AddToTrain(individual):
    global annoy_train
    global test_db
    global IND_SIZE
    global config

    max_memory = 5

    if set.get_master_volume() == 1:
        print set.get_master_volume()
        set.set_master_volume(0.85)
        test_db.append(individual)
        print "SAVING TO TRAINING SET. TestDB Size: " + str(len(test_db))
        annoy_train = AnnoyIndex(IND_SIZE)
        annoy_train.add_item(annoy_train.get_n_items(), individual)
        annoy_train.build(config["annoy_tree"])  # 10 trees
        if len(test_db) > max_memory:
            test_db.pop(0)
            print "delete old memory entry"

    if set.get_master_volume() == 0:
        test_db = []
        # gen_record = []
        annoy_train = AnnoyIndex(IND_SIZE)
        annoy_train.build(config["annoy_tree"])  # 10 trees
        print "clean set"
        set.set_master_volume(0.85)
def build_index(embedding_fun, batch_size, sentences):
    ann = AnnoyIndex(D)
    batch_sentences = []
    batch_indexes = []
    last_indexed = 0
    num_batches = 0
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        with open('wiki.txt.uniq', 'r') as fr:
            for sindex, sentence in enumerate(fr):
                batch_sentences.append(sentence)
                batch_indexes.append(sindex)

                if len(batch_sentences) == batch_size:
                    context_embed = sess.run(embedding_fun,
                                             feed_dict={sentences: batch_sentences})
                    for index in batch_indexes:
                        ann.add_item(index, context_embed[index - last_indexed])
                    batch_sentences = []
                    batch_indexes = []
                    last_indexed += batch_size
                    if num_batches % 10000 == 0:
                        print_with_time('sindex: {} annoy_size: {}'.format(sindex, ann.get_n_items()))
                    num_batches += 1

            if batch_sentences:
                context_embed = sess.run(embedding_fun,
                                         feed_dict={sentences: batch_sentences})
                for index in batch_indexes:
                    ann.add_item(index, context_embed[index - last_indexed])
    return ann
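# Hedged usage sketch (not part of the original source): the index returned by build_index above
# still has to be built (and optionally saved) by the caller before it can answer queries.
# D, embedding_fun and the sentences placeholder (sentences_ph) are assumed to be defined
# elsewhere in the module, and the output file name is illustrative only.
ann = build_index(embedding_fun, batch_size=128, sentences=sentences_ph)
ann.build(10)                      # 10 trees; more trees trade index size for recall
ann.save('wiki_sentences.annoy')   # hypothetical output file
# nns = ann.get_nns_by_vector(query_vector, 10)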
def test_no_items(self):
    idx = AnnoyIndex(100)
    idx.build(n_trees=10)
    idx.save('foo.idx')
    idx = AnnoyIndex(100)
    idx.load('foo.idx')
    self.assertEquals(idx.get_n_items(), 0)
    self.assertEquals(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [])
def test_get_n_item(self):
    print "test_get_n_item"
    os.system("rm -rf test_db")
    os.system("mkdir test_db")
    f = 3
    i = AnnoyIndex(f, 2, "test_db", 10, 1000, 3048576000, 0)
    i.create()
    i.add_item(0, [0, 0, 1])
    i.add_item(1, [0, 1, 0])
    i.add_item(2, [1, 0, 0])
    i1 = i.get_n_items(0)
    self.assertEqual(i1, [0, 0, 1])
    i2 = i.get_n_items(1)
    self.assertEqual(i2, [0, 1, 0])
    i3 = i.get_n_items(2)
    self.assertEqual(i3, [1, 0, 0])
def test_get_n_item(self):
    print "test_get_n_item"
    os.system("rm -rf test_db")
    os.system("mkdir test_db")
    f = 3
    i = AnnoyIndex(f, 2, "test_db", 10, 1000, 3048576000, 0)
    i.create()
    i.add_item(0, [0, 0, 1])
    i.add_item(1, [0, 1, 0])
    i.add_item(2, [1, 0, 0])
    i1 = i.get_n_items(0)
    self.assertEqual(i1, [0, 0, 1])
    i2 = i.get_n_items(1)
    self.assertEqual(i2, [0, 1, 0])
    i3 = i.get_n_items(2)
    self.assertEqual(i3, [1, 0, 0])
def test_only_one_item(self):
    # reported to annoy-user by Kireet Reddy
    idx = AnnoyIndex(100)
    idx.add_item(0, numpy.random.randn(100))
    idx.build(n_trees=10)
    idx.save('foo.idx')
    idx = AnnoyIndex(100)
    idx.load('foo.idx')
    self.assertEquals(idx.get_n_items(), 1)
    self.assertEquals(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [0])
def main():
    args = setup_args()
    print_with_time(args)

    start_time = time.time()
    ann = AnnoyIndex(args.vector_size, metric='angular')
    ann.load(args.ann)
    end_time = time.time()
    print('Load Time: {}'.format(end_time - start_time))
    print_with_time('Annoy Index: {}'.format(ann.get_n_items()))

    start_time = time.time()
    df = read_data(args.csv_file_path, args.filter_data)
    content_array = df.to_numpy()
    end_time = time.time()
    print_with_time('Sentences: {} Time: {}'.format(len(content_array), end_time - start_time))

    # start_time = time.time()
    # embed_fn = hub.load(args.use_model)
    # end_time = time.time()
    # print_with_time('Model loaded time: {}'.format(end_time - start_time))

    random_projection_matrix = None
    if args.random_projection:
        if os.path.exists('random_projection_matrix'):
            print("Loading random projection matrix...")
            with open('random_projection_matrix', 'rb') as handle:
                random_projection_matrix = pickle.load(handle)
            print('random projection matrix is loaded.')

    while True:
        input_sentence_id = input('Enter sentence id: ').strip()
        if input_sentence_id == 'q':
            return
        print_with_time('Input Sentence: {}'.format(input_sentence_id))

        query_filter = 'GUID == "' + input_sentence_id + '"'
        input_data_object = df.query(query_filter)
        input_sentence = input_data_object['CONTENT']

        start_time = time.time()
        query_sentence_vector = generate_embeddings(input_sentence.values[0], args.use_model, random_projection_matrix)
        print_with_time('vec done')
        similar_sentences = find_similar_items(ann, query_sentence_vector, content_array, args.k)
        end_time = time.time()
        print_with_time('nns done: Time: {}'.format(end_time - start_time))

        for sentence in similar_sentences[1:]:
            if args.filter_data:
                if sentence[2] in ['country-related', 'person-related']:
                    print(sentence[0])
            else:
                print(sentence[0])
def test_dense_annoy_index(annoy_index_file):
    data = np.random.choice([0, 1], size=(10, 5))
    index = build_annoy_index(data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5)
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)
def test_build_unbuild(self):
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    for j in xrange(1000):
        i.add_item(j, [random.gauss(0, 1) for x in xrange(f)])
    i.build(10)

    for j in xrange(100):
        i.unbuild()
        i.build(10)

    self.assertEqual(i.get_n_items(), 1000)
def get_similar_sentences(query):
    embeddings = load_embeddings()
    sentence_ids = load_sentence_ids()

    index = AnnoyIndex(get_embeddings_dim(embeddings), "angular")
    index.load("index.ann")
    print("Found {} items in the index.".format(index.get_n_items()))
    print("The index uses {} trees.".format(index.get_n_trees()))
    print("")

    closest, dists = index.get_nns_by_vector(embeddings[query], 10, include_distances=True)  # noqa: E501
    assert (len(closest) == len(dists))
    closest = map(lambda sid: sentence_ids[sid], closest)
    return zip(closest, dists)
def generate_extra_pair_basis(basis,
                              X,
                              n_neighbors,
                              tree: AnnoyIndex,
                              distance='euclidean',
                              verbose=True):
    '''Generate pairs that connects the extra set of data to the fitted basis.
    '''
    npr, dimp = X.shape

    assert (basis is not None or tree is not None), \
        "If the annoyindex is not cached, the original dataset must be provided."

    # Build the tree again if not cached
    if tree is None:
        n, dim = basis.shape
        assert dimp == dim, "The dimension of the original dataset is different from the new one's."
        tree = AnnoyIndex(dim, metric=distance)
        if _RANDOM_STATE is not None:
            tree.set_seed(_RANDOM_STATE)
        for i in range(n):
            tree.add_item(i, basis[i, :])
        tree.build(20)
    else:
        n = tree.get_n_items()

    n_neighbors_extra = min(n_neighbors + 50, n - 1)
    nbrs = np.zeros((npr, n_neighbors_extra), dtype=np.int32)
    knn_distances = np.empty((npr, n_neighbors_extra), dtype=np.float32)

    for i in range(npr):
        nbrs[i, :], knn_distances[i, :] = tree.get_nns_by_vector(
            X[i, :], n_neighbors_extra, include_distances=True)

    print_verbose("Found nearest neighbor", verbose)
    # sig = np.maximum(np.mean(knn_distances[:, 3:6], axis=1), 1e-10)
    # print_verbose("Calculated sigma", verbose)

    # Debug
    # print_verbose(f"Sigma is of the scale of {sig.shape}", verbose)
    # print_verbose(f"KNN dist is of shape scale of {knn_distances.shape}", verbose)
    # print_verbose(f"nbrs max: {nbrs.max()}", verbose)

    # scaling the distances is not possible since we don't always track the basis
    # scaled_dist = scale_dist(knn_distances, sig, nbrs)
    print_verbose("Found scaled dist", verbose)

    pair_neighbors = sample_neighbors_pair_basis(n, X, knn_distances, nbrs, n_neighbors)
    return pair_neighbors
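# Hedged sketch (not part of the original source): one way to cache the basis index on disk so
# that generate_extra_pair_basis above can later be called with tree=... instead of rebuilding it.
# The helper names, the file name and the choice of 20 trees are assumptions mirroring the
# function above; only standard Annoy calls (add_item/build/save/load) are used.
from annoy import AnnoyIndex

def cache_basis_tree(basis, distance='euclidean', path='basis.ann'):
    n, dim = basis.shape
    tree = AnnoyIndex(dim, metric=distance)
    for i in range(n):
        tree.add_item(i, basis[i, :])
    tree.build(20)
    tree.save(path)   # writes a memory-mappable file that can be reloaded cheaply
    return tree

def load_basis_tree(dim, distance='euclidean', path='basis.ann'):
    tree = AnnoyIndex(dim, metric=distance)
    tree.load(path)
    return tree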
def merge_indicies(self, index_file_a, index_file_b, sender_urn):
    logger.info("Merging {0} and {1} for {2} index".format(index_file_a, index_file_b, sender_urn))
    index_a = AnnoyIndex(self.feat_size, metric='euclidean')
    index_b = AnnoyIndex(self.feat_size, metric='euclidean')
    new_index = AnnoyIndex(self.feat_size, metric='euclidean')
    index_a.load(index_file_a)
    index_b.load(index_file_b)

    cnt = 0
    for i in range(index_a.get_n_items()):
        new_index.add_item(cnt, index_a.get_item_vector(i))
        cnt += 1
    for i in range(index_b.get_n_items()):
        new_index.add_item(cnt, index_b.get_item_vector(i))
        cnt += 1

    new_index_file = index_file_a + ".merged"
    index_a.unload()
    index_b.unload()
    new_index.build(self.n_trees)
    new_index.save(new_index_file)
    logger.info("Merging {0} and {1} for {2} index, total number of items: {3}".format(
        index_file_a, index_file_b, sender_urn, cnt))
    new_index.unload()

    pykka.ActorRegistry.get_by_urn(sender_urn).proxy().complete_compaction(
        new_index_file=new_index_file,
        index_file_a=index_file_a,
        index_file_b=index_file_b
    )
def make_pairs(id):
    index = AnnoyIndex(dim, 'euclidean')
    index.load(model_path)

    # clamp k to an integer between 50 and 200, but below the index size
    k = min(index.get_n_items() - 1, max(50, min(index.get_n_items() // 5, 200)))  # eh idk if this is necessary lol
    last_k = 0
    found = 0
    i = 0
    # loop until we get a satisfactory amount (15 for now), or for 5 iterations
    while found < 15 and i < 5 and k < index.get_n_items():
        i += 1
        neighbors = get_neighbors(id, k)[last_k:]
        for neighbor in neighbors:
            if neighbor == id:
                continue
            p = Pair.query.filter_by(hash=str(hash(f'{min(id, neighbor)}-{max(id, neighbor)}'))).first()
            print(p)
            if not p:
                found += 1
                new_pair = Pair(min(id, neighbor), max(id, neighbor))
                db_session.add(new_pair)
                db_session.commit()
def test_build_sparse_annoy_index(annoy_index_file):
    data = np.random.choice([0, 1], size=(10, 5))
    sparse_data = csr_matrix(data)
    index = build_annoy_index(sparse_data, annoy_index_file)
    assert os.path.exists(annoy_index_file)

    loaded_index = AnnoyIndex(5, metric='angular')
    loaded_index.load(annoy_index_file)

    assert index.f == loaded_index.f == 5
    assert index.get_n_items() == loaded_index.get_n_items() == 10
    assert index.get_nns_by_item(0, 5) == loaded_index.get_nns_by_item(0, 5)

    index.unload()
    loaded_index.unload()
def baseline_train(olddata, f, trees):
    """olddata to train with using f number of features of the data and
    building an index with trees number of trees"""
    t = AnnoyIndex(f)  # Length of item vector that will be indexed
    if os.path.isfile(saving_model):
        print "Loading in a pre-made, large read-only data structure we previously made with training data to use for approximate nearest neighbors on holdout data..."
        t.load(saving_model)
    else:
        print "Creating a large read-only data structure with training data to use for approximate nearest neighbors on holdout data..."
        for i in olddata.index:
            v = list(olddata.ix[i, ['latitude', 'longitude', 'time_period']])
            t.add_item(i, v)
        print "Building the trees..."
        t.build(trees)
        assert t.get_n_items() == olddata.shape[0]
        print "Saving the model..."
        t.save(saving_model)  # Can easily be loaded into memory later.
    return t
def baseline_train(olddata, f, trees):
    """olddata to train with using f number of features of the data and
    building an index with trees number of trees"""
    t = AnnoyIndex(f)  # Length of item vector that will be indexed
    if os.path.isfile(saving_model):
        print "Loading in a pre-made, large read-only data structure we previously made with training data to use for approximate nearest neighbors on holdout data..."
        t.load(saving_model)
    else:
        print "Creating a large read-only data structure with training data to use for approximate nearest neighbors on holdout data..."
        for i in olddata.index:
            v = list(olddata.ix[i, ["latitude", "longitude", "time_period"]])
            t.add_item(i, v)
        print "Building the trees..."
        t.build(trees)
        assert t.get_n_items() == olddata.shape[0]
        print "Saving the model..."
        t.save(saving_model)  # Can easily be loaded into memory later.
    return t
def main():
    t = AnnoyIndex(99, metric='euclidean')
    lines = dict()
    lookup = dict()

    prompt_word = input("Get the nearest semantic neighbors of: ")
    prompt_vec = find_glove_vector(prompt_word)

    print("loading...")
    index = 0
    for row in open("semantic_vectors_weighted91.txt"):
        spl = row.find("@@@")
        line = row[0:spl - 1].lower()
        vec = row[spl + 3:-1]
        vals = np.array([float(val) for val in vec.split(", ")])
        if line in lookup:
            continue
        t.add_item(index, vals)
        lines[index] = line
        lookup[line] = [index]
        index += 1
        if index % 50000 == 0:
            print(line)
            print("{0} vectors loaded".format(index))

    last_index = index + 1
    t.add_item(last_index, prompt_vec)  # add input vector so its neighbors can be calculated
    lookup[prompt_word] = [last_index]
    lines[last_index] = prompt_word

    t.build(100)
    print("done.")
    print("Num dict items: {0}".format(len(lookup)))
    print("Num list items: {0}".format(len(lines)))
    print("Num index items: {0}".format(t.get_n_items()))

    try:
        vec = prompt_vec
        print(nn_lookup(t, vec))
        print([lines[i[0]] for i in nn_lookup(t, vec)])
    except KeyError:
        print("not found")
def build_index(annoy_vector_dimension, embedding_fun, batch_size, sentences,
                content_array, stop_words, content_index):
    ann = AnnoyIndex(annoy_vector_dimension, metric='angular')
    batch_sentences = []
    batch_indexes = []
    last_indexed = 0
    num_batches = 0
    content = ''

    with tf.compat.v1.Session() as sess:
        sess.run([
            tf.compat.v1.global_variables_initializer(),
            tf.compat.v1.tables_initializer()
        ])
        for sindex, sentence in enumerate(content_array):
            content = sentence[content_index]
            if stop_words:
                content = remove_stopwords(sentence[1])
            batch_sentences.append(content)
            batch_indexes.append(sindex)

            if len(batch_sentences) == batch_size:
                context_embed = sess.run(embedding_fun,
                                         feed_dict={sentences: batch_sentences})
                for index in batch_indexes:
                    ann.add_item(index, context_embed[index - last_indexed])
                batch_sentences = []
                batch_indexes = []
                last_indexed += batch_size
                if num_batches % 10000 == 0:
                    print_with_time('sindex: {} annoy_size: {}'.format(sindex, ann.get_n_items()))
                num_batches += 1

        if batch_sentences:
            context_embed = sess.run(embedding_fun, feed_dict={sentences: batch_sentences})
            for index in batch_indexes:
                ann.add_item(index, context_embed[index - last_indexed])

    return ann
def load(self):
    self.prev_id = -1
    self.indexes = []
    logger.info("Loading index {0}".format(self.actor_urn))

    for index in self.indexes:
        index.unload()

    for f in sorted(listdir(self.index_dir)):
        if f.endswith(".ann"):
            self.index_files.append(join(self.index_dir, f))
            index = AnnoyIndex(self.feat_size, metric='euclidean')
            index.load(join(self.index_dir, f))
            self.indexes.append(index)
            self.prev_id += index.get_n_items()
        elif f.endswith('saved_state'):
            self.mem_store = np.load(join(self.index_dir, f)).tolist()

    logger.info("Loaded {0} files with total {1} records for index {2}"
                .format(len(self.indexes), self.prev_id + 1, self.actor_urn))
def annotate_all_questions():
    embeddings = load_embeddings()
    sentence_ids = load_sentence_ids()

    index = AnnoyIndex(get_embeddings_dim(embeddings), "angular")
    index.load("index.ann")
    print("Found {} items in the index.".format(index.get_n_items()))
    print("The index uses {} trees.".format(index.get_n_trees()))
    print("")

    df = pd.concat(map(dm.ALLEN_AI_OBQA, list(OBQAType)))
    annotations = {}
    for _, row in tqdm.tqdm(df.iterrows(), total=len(df)):
        for answer in row.answers:
            sent = row.question + " " + answer
            closest = index.get_nns_by_vector(embeddings[sent], 75)
            closest = list(map(lambda sid: sentence_ids[sid], closest))
            annotations[sent] = closest

    pickle.dump(annotations, open("annotations.pkl", "wb"))
    print("Annotations written to annotations.pkl")
def main():
    args = setup_args()
    print_with_time(args)

    start_time = time.time()
    ann = AnnoyIndex(D)
    ann.load(args.ann)
    end_time = time.time()
    print('Load Time: {}'.format(end_time - start_time))
    print_with_time('Annoy Index: {}'.format(ann.get_n_items()))

    start_time = time.time()
    sentences = load_sentences(args.sentences)
    end_time = time.time()
    print_with_time('Sentences: {} Time: {}'.format(len(sentences), end_time - start_time))

    start_time = time.time()
    embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/1")
    sentences_ph = tf.placeholder(dtype=tf.string, shape=[None])
    embedding_fun = embed(sentences_ph)
    sess = tf.Session()
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    end_time = time.time()
    print_with_time('Ready! TF setup time: {}'.format(end_time - start_time))

    while True:
        input_sentence = input('Enter sentence: ').strip()
        if input_sentence == 'q':
            return
        print_with_time('Input Sentence: {}'.format(input_sentence))

        start_time = time.time()
        sentence_vector = sess.run(embedding_fun, feed_dict={sentences_ph: [input_sentence]})
        print_with_time('vec done')
        nns = ann.get_nns_by_vector(sentence_vector[0], args.k)
        end_time = time.time()
        print_with_time('nns done: Time: {}'.format(end_time - start_time))

        similar_sentences = [sentences[nn] for nn in nns]
        for sentence in similar_sentences:
            print(sentence)
class AnnoyClient:
    DIMENSION = 100

    def __init__(self, index_file: str, id_list: List[str]):
        print('Initializing AnnoyIndex...')
        self.index = AnnoyIndex(self.DIMENSION, 'angular')
        self.index.load(index_file)
        self.id_list = id_list
        print('Done')

    def search(self, query: List[float], n: int = 100) -> List[dict]:
        items = self.index.get_nns_by_vector(query, n, include_distances=False)
        print(items)
        return [{
            'id': self.id_list[i],
            'rank': r + 1
        } for (r, i) in enumerate(items)]

    def get_total_count(self) -> int:
        return self.index.get_n_items()
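# Hedged usage sketch (not part of the original source): builds a small 100-dimensional index
# compatible with AnnoyClient above. The file name, id values and random vectors are
# illustrative only; only standard Annoy calls (add_item/build/save/load) are used.
import random
from annoy import AnnoyIndex

ids = ['doc-{}'.format(n) for n in range(1000)]                 # hypothetical item ids
builder = AnnoyIndex(AnnoyClient.DIMENSION, 'angular')
for item, _ in enumerate(ids):
    builder.add_item(item, [random.gauss(0, 1) for _ in range(AnnoyClient.DIMENSION)])
builder.build(10)
builder.save('items.ann')                                       # hypothetical index file

client = AnnoyClient('items.ann', ids)
results = client.search([random.gauss(0, 1) for _ in range(AnnoyClient.DIMENSION)], n=5)
# results -> [{'id': 'doc-…', 'rank': 1}, …]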
def build_save_ann_from_iter_lookup(sentence_id_iter, lookup_fun, ann_file, num_trees=10,
                                    log_freq=1000, batch_size=32, encoder=None):
    if not encoder:
        encoder = USEEncoder()
    ann = AnnoyIndex(encoder.dim())

    sentences = []
    sentences_ids = []
    for sentence_id in sentence_id_iter:
        sentence = lookup_fun[sentence_id]
        sentence = sentence.strip()

        sentences.append(sentence)
        sentences_ids.append(sentence_id)

        if len(sentences) == batch_size:
            vectors = encoder.encode(sentences)
            for vector, sid in zip(vectors, sentences_ids):
                ann.add_item(sid, vector)
            sentences = []
            sentences_ids = []

            if ann.get_n_items() % (batch_size * log_freq) == 0:
                logging.info(f'Indexed: {ann.get_n_items()}')

    if sentences:
        vectors = encoder.encode(sentences)
        for vector, sid in zip(vectors, sentences_ids):
            ann.add_item(sid, vector)

    logging.info(f'Final Indexed: {ann.get_n_items()}')
    ann.build(num_trees)
    ann.save(ann_file)
    return ann
def build_index(batch_size, content_array, model_url, random_projection_matrix):
    VECTOR_LENGTH = 512
    if random_projection_matrix is not None:
        VECTOR_LENGTH = 64
    ann = AnnoyIndex(VECTOR_LENGTH, metric=METRIC)

    batch_sentences = []
    batch_indexes = []
    last_indexed = 0
    num_batches = 0
    for sindex, sentence in enumerate(content_array):
        # sentence_embedding = generate_embeddings(sentence[1], model_url, random_projection_matrix)
        # ann.add_item(sindex, sentence_embedding[0])
        batch_sentences.append(sentence[1])
        batch_indexes.append(sindex)

        if len(batch_sentences) == batch_size:
            context_embed = generate_embeddings(batch_sentences, model_url, random_projection_matrix)
            for index in batch_indexes:
                ann.add_item(index, context_embed[index - last_indexed])
            batch_sentences = []
            batch_indexes = []
            last_indexed += batch_size
            if num_batches % 10000 == 0:
                print_with_time('sindex: {} annoy_size: {}'.format(sindex, ann.get_n_items()))
            num_batches += 1

    if batch_sentences:
        context_embed = generate_embeddings(batch_sentences, model_url, random_projection_matrix)
        for index in batch_indexes:
            ann.add_item(index, context_embed[index - last_indexed])

    return ann
def main():
    t = AnnoyIndex(200, metric='euclidean')
    lines = list()
    lookup = dict()
    print("loading...")
    index = 0
    for row in open("phonetic_vectors_every2_d200_reformatted.txt"):
        spl = row.find("@@@")
        line = row[0:spl - 1]
        stripped_line = line[2:-1].lower()  # skip the b''
        vec = row[spl + 3:-1]
        vals = np.array([float(val) for val in vec.split(", ")])
        if stripped_line in lookup:
            continue
        lookup[stripped_line] = index
        lines.append(stripped_line)
        t.add_item(index, vals)
        index += 1
        if index % 50000 == 0:
            print(stripped_line.lower())
            print("{0} vectors loaded".format(index))

    t.build(100)
    print("done.")
    print("Num dict items: {0}".format(len(lookup)))
    print("Num list items: {0}".format(len(lines)))
    print("Num index items: {0}".format(t.get_n_items()))

    try:
        vec = lookup["skating on thin ice"]
        print(vec)
        print(t.get_item_vector(vec))
        print(nn_lookup(t, t.get_item_vector(vec)))
        print([lines[i[0]] for i in nn_lookup(t, t.get_item_vector(vec))])
    except KeyError:
        print("not found")
def r_neighbors(shape_features: {}, db_features: {}, r=s.RNN_RANGE) -> []:
    """
    It determines the closest shapes to the query shape by computing R-Nearest Neighbors
    on a N-dimensional Approximate Nearest Neighbors feature mapping.
    ----------------------------
    Args:
        shape_features (obj: 'dict'): The dictionary containing the feature metrics of the shape
        db_features (obj: 'dict'): The dictionary containing the feature metrics of the shapes
        r (int): The distance range, the default value specified in Settings

    Returns:
        neighbors (obj: 'dict'): The dictionary containing the closest shapes (key) and the
        respective distance to the query shape (value)
    """
    ann = AnnoyIndex(56, 'euclidean')  # 56 features

    for id, featureList in db_features.items():
        features_flatten = flatten_features_array(featureList)
        ann.add_item(id, features_flatten)

    shape_features_flat = flatten_features_array(shape_features)

    # To get the neighbors, it is necessary to add the new item to the mapping first
    shape_id = ann.get_n_items()
    ann.add_item(shape_id, shape_features_flat)
    ann.build(s.CATEGORIES)

    neighbors = ann.get_nns_by_item(shape_id, 200, include_distances=True)

    range_neighbors = ([], [])
    for i, distance in enumerate(neighbors[1]):
        if distance < r:
            range_neighbors[0].append(neighbors[0][i])
            range_neighbors[1].append(distance)

    return range_neighbors
class Memory:
    def __init__(self, capacity, state_dim, value_dim):
        self.capacity = capacity
        print("state_dim:", state_dim)
        self.states = np.zeros((capacity, state_dim))
        self.values = np.zeros((capacity, value_dim))
        self.curr_capacity = 0
        self.curr_ = 0
        self.lru = np.zeros(capacity)
        self.tm = 0

        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

        self.index = AnnoyIndex(state_dim)
        self.index.set_seed(123)
        self.update_size = 1
        self.build_capacity = 0

    def sample_knn_test(self, state, k):
        inds, dists = self.index.get_nns_by_vector(state, k, include_distances=True)
        self.tm += 0.01
        self.lru[inds] = self.tm
        return self.states[inds], self.values[inds], dists

    def sample_knn(self, states, k):
        dists = []
        inds = []
        for state in states:
            ind, dist = self.index.get_nns_by_vector(state, k, include_distances=True)
            inds.append(ind)
            dists.append(dist)
        # inds = np.reshape(np.array(inds), -1)
        self.tm += 0.01
        self.lru[inds] = self.tm
        return self.states[inds], self.values[inds], dists

    def sample(self, n_samples):
        if self.curr_capacity < n_samples or n_samples == 0:
            idx = np.random.choice(np.arange(len(self.states)), n_samples, replace=False)
        else:
            idx = np.random.choice(np.arange(self.curr_capacity), n_samples, replace=False)
        self.tm += 0.01
        self.lru[idx] = self.tm
        embs = self.states[idx]
        values = self.values[idx]
        return embs, values

    def add_knn(self, states, values):
        self._add_knn(states, values)

    def add_knn_lru(self, states, values):
        self._add_knn(states, values, lru=True)

    def add(self, states, values):
        self._add(states, values)

    def add_lru(self, states, values):
        self._add(states, values, lru=True)

    def add_rand(self, states, values):
        self._add(states, values, rand=True)

    def _insert(self, states, values, indices):
        self.cached_states = self.cached_states + states
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices
        if len(self.cached_states) >= self.update_size:
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            self.states[ind] = self.cached_states[i]
            self.values[ind] = self.cached_values[i]
            self.index.add_item(ind, self.cached_states[i])
        self.index.build(50)
        self.build_capacity = self.curr_capacity

        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

    def _rebuild_index(self):
        self.index.unbuild()
        for ind, state in enumerate(self.states[:self.curr_capacity]):
            self.index.add_item(ind, state)
        self.index.build(50)
        self.build_capacity = self.curr_capacity

    def _add_knn(self, states, values, lru=False):
        # print(states)
        indices = []
        states_ = []
        values_ = []
        for i, _ in enumerate(states):
            if lru:
                if self.curr_capacity >= self.capacity:
                    ind = np.argmin(self.lru)
                else:
                    ind = self.curr_capacity
                    self.curr_capacity += 1
            else:
                if self.curr_capacity >= self.capacity:
                    self.curr_ = (self.curr_ + 1) % self.capacity
                    ind = self.curr_
                else:
                    ind = self.curr_capacity
                    self.curr_capacity += 1
            self.lru[ind] = self.tm
            indices.append(ind)
            states_.append(states[i])
            values_.append(values[i])
        self._insert(states_, values_, indices)

    def _add(self, states, values, rand=False, lru=False):
        for i, state in enumerate(states):
            if self.curr_capacity < self.capacity:
                self.curr_ = (self.curr_ + 1) % self.capacity
                # self.states[self.curr_] = state
                # self.values[self.curr_] = values[i]
                if self.curr_capacity < self.capacity:
                    self.curr_capacity += 1
            else:
                if lru:
                    self.curr_ = np.argmin(self.lru)
                if rand:
                    self.curr_ = np.random.choice(np.arange(self.curr_capacity), 1, replace=False)
                if not lru and not rand:
                    self.curr_ = (self.curr_ + 1) % self.capacity
            self.states[self.curr_] = state
            self.values[self.curr_] = values[i]

    @property
    def length(self):
        # assert self.index.get_n_items() == self.curr_capacity
        # return self.curr_capacity
        return self.index.get_n_items()
class NearSentence(object):
    def __init__(self, fn_word, model_name, model_path):
        self.model = QueryModel(fn_word, model_name, model_path)
        self.queries = []
        self.titles = []
        self.query_index = 0
        self.title_index = 0
        self.query_ann = AnnoyIndex(self.model.dim, metric='euclidean')
        self.title_ann = AnnoyIndex(self.model.dim, metric='euclidean')

    def load_queries(self, fn_query, column):
        print '[In load_queries] Load candidate queries'
        sentences = []
        chunk = []
        vecs = []
        with open(fn_query) as fin:
            for line in fin:
                ll = line.decode('utf8').strip().split('\t')
                if len(ll) < column:
                    continue
                chunk.append(ll[column - 1])
                if len(chunk) == 1000:
                    vec, valid_sentence = self.model.get_query_vec(chunk)
                    vec = vec / np.sqrt(np.sum(vec ** 2, 1, keepdims=True))
                    vecs.extend(list(vec))
                    sentences.extend(valid_sentence)
                    chunk = []
            if len(chunk) > 0:
                vec, valid_sentence = self.model.get_query_vec(chunk)
                vec = vec / np.sqrt(np.sum(vec ** 2, 1, keepdims=True))  # normalize the last chunk as well
                vecs.extend(list(vec))
                sentences.extend(valid_sentence)

        print '[In load_queries] Build query annoy tree'
        for s, v in izip(sentences, vecs):
            self.queries.append(s)
            # if vecs == [0] * self.vectorizer.dim:
            #     continue
            self.query_ann.add_item(self.query_index, v)
            self.query_index += 1
        self.query_ann.build(10)
        print '[In load_queries] Size of tree =', self.query_ann.get_n_items()

    def load_titles(self, fn_title, column):
        print '[In load_titles] Load candidate titles'
        sentences = []
        chunk = []
        vecs = []
        with open(fn_title) as fin:
            for line in fin:
                ll = line.decode('utf8').strip().split('\t')
                if len(ll) < column:
                    continue
                chunk.append(ll[column - 1])
                if len(chunk) == 1000:
                    vec, valid_sentence = self.model.get_title_vec(chunk)
                    vec = vec / np.sqrt(np.sum(vec ** 2, 1, keepdims=True))
                    vecs.extend(list(vec))
                    sentences.extend(valid_sentence)
                    chunk = []
            if len(chunk) > 0:
                vec, valid_sentence = self.model.get_title_vec(chunk)
                vec = vec / np.sqrt(np.sum(vec ** 2, 1, keepdims=True))
                vecs.extend(list(vec))
                sentences.extend(valid_sentence)

        print '[In load_titles] Build titles annoy tree, size =', len(vecs)
        for s, v in izip(sentences, vecs):
            self.titles.append(s)
            self.title_ann.add_item(self.title_index, v)  # v is a list
            self.title_index += 1
        self.title_ann.build(10)
        print '[In load_titles] Size of tree =', self.title_ann.get_n_items()

    def get_k_nearest_query(self, query, k):
        if isinstance(query, unicode):
            query = query.encode('utf8')
        cut_data = text_cutter.process({'title': query})
        cut_query = cut_data['cut_title'].decode('utf8')
        vecs, valid_queries = self.model.get_query_vec([cut_query])
        if len(valid_queries) == 0:
            return []
        vecs = vecs / np.sqrt(np.sum(vecs ** 2, 1, keepdims=True))
        vec = list(vecs)[0]
        k_neighbors, scores = self.query_ann.get_nns_by_vector(vec, n=k, include_distances=True)
        neighbors = []
        for i in k_neighbors:
            neighbors.append(self.queries[i])
        return sorted(zip(neighbors, scores), key=lambda x: x[-1])

    # def sim(self, u, v):
    #     norm_u = u / np.sqrt(np.sum(u ** 2, keepdims=True))
    #     norm_v = u / np.sqrt(np.sum(v ** 2, keepdims=True))
    #     return np.dot(norm_u, norm_v)

    def get_k_nearest_title(self, title, k):
        if isinstance(title, unicode):
            title = title.encode('utf8')
        cut_data = text_cutter.process({'title': title})
        title = cut_data['cut_title'].decode('utf8')
        vecs, valid_titles = self.model.get_title_vec([title])
        if len(valid_titles) == 0:
            return []
        vecs = vecs / np.sqrt(np.sum(vecs ** 2, 1, keepdims=True))
        vec = list(vecs)[0]
        k_neighbors, scores = self.title_ann.get_nns_by_vector(vec, n=k, include_distances=True)
        neighbors = []
        for i in k_neighbors:
            neighbors.append(self.titles[i])
        return sorted(zip(neighbors, scores), key=lambda x: x[-1])

    def get_answers(self, query, k):
        if isinstance(query, unicode):
            query = query.encode('utf8')
        cut_data = text_cutter.process({'title': query})
        cut_query = cut_data['cut_title'].decode('utf8')
        vecs, valid_queries = self.model.get_query_vec([cut_query])
        if len(valid_queries) == 0:
            return []
        vecs = vecs / np.sqrt(np.sum(vecs ** 2, 1, keepdims=True))
        vec = list(vecs)[0]
        # recall titles according to cosine similarity
        candidate_titles_index, scores = self.title_ann.get_nns_by_vector(vec, n=k * 10, include_distances=True)
        # rank candidate titles using model
        candidate_titles = []
        for i in candidate_titles_index:
            candidate_titles.append(self.titles[i])
        ranks = self.model.rank_titles(cut_query, candidate_titles)[:k]
        return ranks

    def process(self, data):
        res = {}
        if 'titles' in data:
            res['title_nns'] = self.get_k_nearest_title(data['titles'], 10)
        if 'queries' in data:
            res['query_nns'] = self.get_k_nearest_query(data['queries'], 10)
        return json.dumps(res, ensure_ascii=False).encode('utf8')