class ImageSearchAnnoyCombo:
    '''
    Image search combining an Annoy index with exact re-ranking.

    The Annoy index supplies approximate nearest-neighbour candidates; the
    feature matrix stored in an HDF5 file is used to re-rank them exactly.
    Annoy's angular distance is dist(u,v) = sqrt(2(1-cos(u,v))).
    '''

    def __init__(self, h5fname='X_ILSVRC2015.hdf5', annf='ILSVRC2015.ann',
                 imageListPath='/home/scratch/benediktb/RegionOfInterest/ILSVRC2015_filelist.txt',
                 dset='fc6fc7'):
        '''
        h5fname: path of the HDF5 file holding the feature matrix
        annf: path of the saved Annoy index
        imageListPath: text file with one image path per line (line i <-> item i)
        dset: name of the dataset inside the HDF5 file
        '''
        # The file is deliberately not closed here: self.X is indexed again
        # in run_query_exact, so the handle has to outlive __init__.
        h5f = h5py.File(h5fname, 'r')
        self.X = h5f[dset]
        # map line number -> file name
        with open(imageListPath, 'r') as f:
            self.line_to_file = {i: line.rstrip() for i, line in enumerate(f)}
        self.A = AnnoyIndex(self.X.shape[1], 'angular')
        self.A.load(annf)

    def run_query_approx(self, query, n=100, accuracy_factor=5):
        '''
        Return an iterable of (file name, Annoy distance) for the n
        approximate nearest neighbours of query.
        '''
        nearest, scores = self.A.get_nns_by_vector(
            query, n, search_k=n * int(accuracy_factor) * 128,
            include_distances=True)
        return zip((self.line_to_file[i] for i in nearest), scores)

    def run_query_exact(self, query, n=1000, nsmall=100):
        '''
        Retrieve n approximate neighbours with Annoy, then rank them exactly
        by cosine distance on the stored features and keep the best nsmall.

        Returns an iterable of (file name, cosine distance), nearest first.
        When fewer than nsmall candidates are available, all of them are
        returned.
        '''
        if n < nsmall:
            n = nsmall
        indexes = self.A.get_nns_by_vector(query, n, search_k=-1,
                                           include_distances=False)
        # Rows are read in increasing index order (getting X by index from
        # disc is very slow).
        indexes_sorted = sorted(indexes)
        if not indexes_sorted:
            return zip((), ())
        # use scipy cdist (or normalize first and do dot product for faster computation)
        distance = cdist(self.X[indexes_sorted],
                         query.reshape((1, query.shape[0])), 'cosine')[:, 0]
        top = min(nsmall, len(distance))
        if top < len(distance):
            # partial sort: indices of the `top` smallest distances
            ind = np.argpartition(distance, top)[:top]
        else:
            # argpartition needs kth < len(distance); with this few candidates
            # every one of them is kept.
            ind = np.arange(len(distance))
        s_ind = np.argsort(distance[ind])
        nearest = ind[s_ind]
        scoresorted = distance[ind][s_ind]
        return zip((self.line_to_file[indexes_sorted[i]] for i in nearest),
                   scoresorted)
def test1_add_item_1(self):
    """Insert three axis-aligned vectors one by one, dumping index nodes
    after every step, then check the neighbour order for three queries."""
    print("test_set_root")
    # start from an empty test_db directory
    for shell_command in ("rm -rf test_db", "mkdir test_db"):
        os.system(shell_command)

    dim = 3
    index = AnnoyIndex(dim, 2, "test_db", 10, 1000, 3048576000, 0)
    # index.verbose(True)
    index.create()

    def dump(node_count):
        # show the first `node_count` nodes of the index
        for node in range(node_count):
            index.display_node(node)

    dump(10)

    index.add_item(0, [0, 0, 1])
    print("after adding 1 data")
    dump(10)

    index.add_item(1, [0, 1, 0])
    print("after adding 2 data")
    dump(10)

    index.add_item(2, [1, 0, 0])
    print("after adding 3 data")
    for node in range(100):
        print("node %d" % node)
        index.display_node(node)

    print("get nns by vector [3,2,1]")
    expectations = (
        ([3, 2, 1], [2, 1, 0]),
        ([1, 2, 3], [0, 1, 2]),
        ([2, 0, 1], [2, 0, 1]),
    )
    for query, expected in expectations:
        self.assertEqual(index.get_nns_by_vector(query, 3), expected)
def test_get_nns_by_vector(self):
    """Neighbour order of three 2-d points in a euclidean index."""
    index = AnnoyIndex(2, 'euclidean')
    for item_id, point in enumerate([[2, 2], [3, 2], [3, 3]]):
        index.add_item(item_id, point)
    index.build(10)

    # (query vector, expected item ids, nearest first)
    cases = (
        ([4, 4], [2, 1, 0]),
        ([1, 1], [0, 1, 2]),
        ([4, 2], [1, 2, 0]),
    )
    for query, expected in cases:
        self.assertEqual(index.get_nns_by_vector(query, 3), expected)
def test_get_nns_by_vector(self):
    """Neighbour order of the three unit axis vectors in a default-metric index."""
    index = AnnoyIndex(3)
    for item_id, axis in enumerate([[0, 0, 1], [0, 1, 0], [1, 0, 0]]):
        index.add_item(item_id, axis)
    index.build(10)

    # (query vector, expected item ids, nearest first)
    cases = (
        ([3, 2, 1], [2, 1, 0]),
        ([1, 2, 3], [0, 1, 2]),
        ([2, 0, 1], [2, 0, 1]),
    )
    for query, expected in cases:
        self.assertEqual(index.get_nns_by_vector(query, 3), expected)
def test_get_nns_by_vector(self):
    """Neighbour order of the three unit axis vectors in an index backed by
    a freshly created test_db directory."""
    print("test_get_nns_by_vector ")
    # start from an empty test_db directory
    for shell_command in ("rm -rf test_db", "mkdir test_db"):
        os.system(shell_command)

    dim = 3
    index = AnnoyIndex(dim, 3, "test_db", 10, 1000, 3048576000, 0)
    for item_id, axis in enumerate([[0, 0, 1], [0, 1, 0], [1, 0, 0]]):
        index.add_item(item_id, axis)

    # (query vector, expected item ids, nearest first)
    cases = (
        ([3, 2, 1], [2, 1, 0]),
        ([1, 2, 3], [0, 1, 2]),
        ([2, 0, 1], [2, 0, 1]),
    )
    for query, expected in cases:
        self.assertEqual(index.get_nns_by_vector(query, 3), expected)
def recall_at(self, n, n_trees=10, n_points=1000, n_rounds=5):
    """Average recall of top-n queries against a 'dot' index of random points.

    For each round, n_points gaussian vectors are indexed; the reference
    answer for point i is the first n indices when all points are sorted by
    dot_metric(data[i], data[j]).  Returns the mean of recall(...) over all
    points and rounds.
    """
    # the best movie/variable name
    total_recall = 0.
    dim = 10
    for _round in range(n_rounds):
        index = AnnoyIndex(dim, 'dot')

        rows = []
        for _point in range(n_points):
            rows.append([random.gauss(0, 1) for _axis in range(dim)])
        data = numpy.array(rows)

        expected_results = []
        for i in range(n_points):
            ranked = sorted(range(n_points),
                            key=lambda j: dot_metric(data[i], data[j]))
            expected_results.append(ranked[:n])

        for item_id, vec in enumerate(data):
            index.add_item(item_id, vec)
        index.build(n_trees)

        for i in range(n_points):
            found = index.get_nns_by_vector(data[i], n)
            total_recall += recall(found, expected_results[i])

    return total_recall / float(n_rounds * n_points)
def test_single_vector(self):
    """An index with one item returns just that item at distance 0, even
    when more neighbours are requested."""
    # https://github.com/spotify/annoy/issues/194
    # Metric given explicitly: relying on the implicit default is deprecated.
    a = AnnoyIndex(3, 'angular')
    a.add_item(0, [1, 0, 0])
    a.build(10)
    a.save('1.ann')
    # assertEqual: the assertEquals alias was removed in Python 3.12.
    self.assertEqual(
        a.get_nns_by_vector([1, 0, 0], 3, include_distances=True),
        ([0], [0.0]))
def test_overwrite_index(self):
    """Querying a loaded index must not crash after its file is overwritten
    by another index (on Windows the overwrite itself must raise IOError)."""
    # Issue #335
    dim = 40

    def random_vector():
        return [random.gauss(0, 1) for _axis in range(dim)]

    # Build the initial index
    first = AnnoyIndex(dim)
    for item_id in range(1000):
        first.add_item(item_id, random_vector())
    first.build(10)
    first.save('test.ann')

    # Load index file
    reader = AnnoyIndex(dim)
    reader.load('test.ann')

    # Overwrite index file
    replacement = AnnoyIndex(dim)
    for item_id in range(500):
        replacement.add_item(item_id, random_vector())
    replacement.build(10)
    if os.name != 'nt':
        replacement.save('test.ann')
    else:
        # Can't overwrite on Windows
        with self.assertRaises(IOError):
            replacement.save('test.ann')

    # Get nearest neighbors
    query = random_vector()
    nns = reader.get_nns_by_vector(query, 1000)  # Should not crash
def retrieve(self):
    """Run ANN retrieval for every (neighbor count, multiplier) combination.

    For each n_neighbors in `knns` and each value of self.multipliers, every
    question in self.test_file is embedded with get_centroid_idf, its
    neighbours are looked up in the Annoy index and mapped through
    self.idmap, and the results are written as JSON under system_results/.

    The names index_file, knns, n_trees, get_centroid_idf and Question are
    not defined in this method; they are expected to exist at module level.
    """
    print('Loading necessary files..')
    u = AnnoyIndex(self.dim, metric='angular')
    u.load(index_file)

    # The question file does not depend on the loop variables: read it once
    # instead of once per (n_neighbors, mult) pair.
    with open(self.test_file, 'r') as data_file:
        questions = json.load(data_file)["questions"]

    directory = "system_results/"
    print('ANN Retrieval..')
    for n_neighbors in knns:
        print('Number of neighbors: ' + str(n_neighbors))
        for mult in self.multipliers:
            print('Multiplier: ' + str(mult))
            search_k = self.n_trees * n_neighbors * mult

            qArray = []
            for question in questions:
                question_body = question["body"]
                question_id = question["id"]
                qcentroid = np.transpose(np.array(get_centroid_idf(
                    question_body, self.emb, self.idf, self.stopwords,
                    self.dim)))
                anns = u.get_nns_by_vector(qcentroid, n_neighbors, search_k)
                doc_anns = [self.idmap[n] for n in anns]
                qArray.append(Question(question_body, question_id, doc_anns))

            if not os.path.exists(directory):
                os.makedirs(directory)
            # NOTE(review): the file name uses the bare name n_trees while
            # search_k uses self.n_trees -- confirm both are intended.
            out_path = (str(directory) + "/" + "CentIDF_annoy_" + str(n_trees)
                        + "_" + str(n_neighbors) + "_" + str(mult) + ".json")
            with open(out_path, "w+") as outfile:
                outfile.write(json.dumps(
                    {"questions": [ob.__dict__ for ob in qArray]}, indent=2))
def test_many_vectors(self):
    """Hamming index over 100000 random binary vectors: distances stay in
    [0, f] and the average nearest-neighbour distance is at most 0.42."""
    dim = 10
    index = AnnoyIndex(dim, 'hamming')
    for item_id in range(100000):
        index.add_item(item_id, numpy.random.binomial(1, 0.5, dim))
    index.build(10)

    # distance bounds for a large result set around the all-zero vector
    _ids, all_dists = index.get_nns_by_vector(
        [0]*dim, 10000, include_distances=True)
    self.assertGreaterEqual(min(all_dists), 0)
    self.assertLessEqual(max(all_dists), dim)

    # average distance to the single nearest neighbour of random queries
    nearest_dists = []
    for _query in range(1000):
        probe = numpy.random.binomial(1, 0.5, dim)
        _ids, found = index.get_nns_by_vector(
            probe, 1, search_k=1000, include_distances=True)
        nearest_dists.append(found[0])
    avg_dist = 1.0 * sum(nearest_dists) / len(nearest_dists)
    self.assertLessEqual(avg_dist, 0.42)
def test_get_nns_by_vector(self):
    """Two points in a euclidean index: the closer one is returned first."""
    f = 2
    i = AnnoyIndex(f, 'euclidean')
    i.add_item(0, [2, 2])
    i.add_item(1, [3, 2])
    i.build(10)

    # assertEqual: the assertEquals alias was removed in Python 3.12.
    self.assertEqual(i.get_nns_by_vector([3, 3], 2), [1, 0])
def test_no_items(self):
    """An index built, saved and reloaded without items reports zero items
    and returns no neighbours."""
    # Metric given explicitly: relying on the implicit default is deprecated.
    idx = AnnoyIndex(100, 'angular')
    idx.build(n_trees=10)
    idx.save('foo.idx')

    idx = AnnoyIndex(100, 'angular')
    idx.load('foo.idx')
    # assertEqual: the assertEquals alias was removed in Python 3.12.
    self.assertEqual(idx.get_n_items(), 0)
    self.assertEqual(
        idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50,
                              include_distances=False),
        [])
class FeatureNN:
    """Single-nearest-neighbour lookup on a saved euclidean Annoy index."""

    # class-level placeholder; every instance gets its own index in __init__
    tree = None

    def __init__(self, features, tree_file):
        """Create an index of dimension `features` and load it from `tree_file`."""
        self.tree = AnnoyIndex(features, metric='euclidean')
        index_path = str(tree_file)
        self.tree.load(index_path)

    def nn(self, x):
        """Return the id of the item closest to `x` (`x` must offer .tolist())."""
        neighbours = self.tree.get_nns_by_vector(x.tolist(), 1)
        return neighbours[0]
def get_rank(uid):
    """Returns a list of the 10 best ranked items for a user

    This function generates a rank of items for a given user by using
    Approximate Nearest Neighbours. The algorithm is imported from the
    Annoy library (developed by Spotify).

    Todo:
        The index is built from scratch every time the function is called,
        which definitely should be changed in the future for increased
        performance. It should be fairly easy to do as ANNOY can store
        indexes in files which can easily be shared by processes. However,
        it works well with a few hundred items as it is now.

    item_queue:
        It is a list of item ids for each user. It acts as a circular queue
        for keeping track of which items the user has seen so far. When two
        new items are shown to the user, they are placed in the back of the
        queue.

    Args:
        uid (int): User ID

    Returns:
        List of item ids (str); an empty list when the user cannot be
        fetched from the database.
    """
    # Metric given explicitly: relying on the implicit default is deprecated.
    ann = AnnoyIndex(data_dimension, 'angular')

    try:
        items = db.items.find()
        q = db.users.find({"uid": uid},
                          {"item_queue": 1, "_id": 0})[0]["item_queue"]
        user = db.users.find({"uid": uid})[0]
    except (TypeError, IndexError, KeyError):
        # Previously only TypeError was caught and execution continued into
        # unbound names; a missing user now yields an empty ranking.
        print("Unable to fetch user from DB")
        return []

    ids = [entry["vid"] for entry in q]

    # Following line can be deleted or modified.
    # It drops the last 15 queue entries, so those items are never indexed
    # and therefore never recommended. This is done to make sure the user
    # only sees new items in the recommended list (assuming 15 is the number
    # of comparisons the user has made).
    # This is sort of a hack and can be removed/modified later on if necessary.
    ids[-15:] = []
    print(ids)

    # Add items to ANN tree; a set keeps the membership test O(1) per item.
    eligible = set(ids)
    id_dict = {}
    for i, item in enumerate(items):
        if item["vid"] in eligible:
            # remember which item id each tree position stands for
            id_dict[i] = item["vid"]
            ann.add_item(i, item["vals"])

    # Erik Bernhardsson (author of ANNOY) suggests to use 2*dimension of
    # data as the number of trees to build.
    ann.build(data_dimension * 2)

    # Get 10 highest ranked items for that user
    nns_tmp = ann.get_nns_by_vector(user["vals"], 10)
    nns = [id_dict[k] for k in nns_tmp]
    print(nns)
    return nns
def test_only_one_item(self):
    """An index with a single item, saved and reloaded, returns only that
    item even when 50 neighbours are requested."""
    # reported to annoy-user by Kireet Reddy
    # Metric given explicitly: relying on the implicit default is deprecated.
    idx = AnnoyIndex(100, 'angular')
    idx.add_item(0, numpy.random.randn(100))
    idx.build(n_trees=10)
    idx.save('foo.idx')

    idx = AnnoyIndex(100, 'angular')
    idx.load('foo.idx')
    # assertEqual: the assertEquals alias was removed in Python 3.12.
    self.assertEqual(idx.get_n_items(), 1)
    self.assertEqual(
        idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50,
                              include_distances=False),
        [0])
def main(args):
    """ Main entry.

    For every tree count in args.ntrees: build (or load a cached) Annoy
    index over the dataset's base vectors, search all query vectors for
    args.topk neighbours, and append recall and per-query time to
    report.txt in args.exp_dir.
    """
    data = Dataset(args.dataset)
    f = data.base.shape[1]

    for ntrees in args.ntrees:
        # Length of item vector that will be indexed.  Metric given
        # explicitly: relying on the implicit default is deprecated.
        t = AnnoyIndex(f, 'angular')
        idxpath = os.path.join(args.exp_dir,
                               'sift_annoy_ntrees%d.idx' % ntrees)
        if not os.path.exists(idxpath):
            logging.info("Adding items ...")
            # NOTE(review): `nbae` is the attribute name used by the
            # original code; confirm it is not a typo in Dataset.
            for i in range(data.nbae):
                t.add_item(i, data.base[i])
                if i % 100000 == 0:
                    logging.info("\t%d/%d", i, data.nbae)
            logging.info("\tDone!")
            logging.info("Building indexes ...")
            t.build(ntrees)
            logging.info("\tDone!")
            t.save(idxpath)
        else:
            logging.info("Loading indexes ...")
            t.load(idxpath)
            logging.info("\tDone!")

        # np.int was only an alias of the builtin int and was removed in
        # NumPy 1.24.
        ids = np.zeros((data.nqry, args.topk), int)
        logging.info("Searching ...")
        tic()
        for i in range(data.nqry):
            ids[i, :] = np.array(t.get_nns_by_vector(data.query[i], args.topk))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-ntrees_%s\n" % ("Annoy", ntrees))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" %
                       (time_costs * 1000 / data.nqry))
class ANN:
    """Thin wrapper around an Annoy index for single-nearest-neighbour lookups."""

    def __init__(self, dimension):
        """Create an empty index for vectors of length `dimension`."""
        # Metric given explicitly: relying on the implicit default is deprecated.
        self.ann = AnnoyIndex(dimension, 'angular')

    def addVectors(self, vectors):
        """Add `vectors` with ids 0..len-1 and build the index with 10 trees."""
        for idx, v in enumerate(vectors):
            self.ann.add_item(idx, v)
        self.ann.build(10)

    def query(self, vector):
        """Return the id of the item nearest to `vector`."""
        return self.ann.get_nns_by_vector(vector, 1)[0]

    def save(self, filename="analogies.ann"):
        """Save the index to `filename` (defaults to the original fixed name)."""
        self.ann.save(filename)

    def load(self, filename):
        """Load a previously saved index from `filename`."""
        self.ann.load(filename)
class ImageSearchAnnoy:
    '''
    Approximate nearest-neighbour image search on a saved Annoy index
    using the 'angular' metric.
    '''

    def __init__(self, dimensions, annf='ILSVRC2015.ann',
                 imageListPath='/home/scratch/benediktb/RegionOfInterest/ILSVRC2015_filelist.txt'):
        '''
        dimensions: length of the indexed vectors
        annf: path of the saved Annoy index
        imageListPath: text file with one image path per line (line i <-> item i)
        '''
        # line number -> file name (trailing whitespace removed)
        names = {}
        with open(imageListPath, 'r') as listing:
            for line_no, line in enumerate(listing):
                names[line_no] = line.rstrip()
        self.line_to_file = names

        self.A = AnnoyIndex(dimensions, 'angular')
        self.A.load(annf)

    def run_query(self, query, n=100, accuracy_factor=2):
        '''
        Return an iterable of (file name, distance) for the n approximate
        nearest neighbours of query; search_k grows with accuracy_factor.
        '''
        budget = n * int(accuracy_factor) * 128
        nearest, scores = self.A.get_nns_by_vector(
            query, n, search_k=budget, include_distances=True)
        file_names = (self.line_to_file[item] for item in nearest)
        return zip(file_names, scores)
def test_random_holes(self):
    """Items added under non-contiguous ids: queries by item and by vector
    must only ever return ids that were actually added."""
    f = 10
    # Metric given explicitly: relying on the implicit default is deprecated.
    index = AnnoyIndex(f, 'angular')
    valid_indices = random.sample(range(2000), 1000)  # leave holes
    for i in valid_indices:
        v = numpy.random.normal(size=(f,))
        index.add_item(i, v)
    index.build(10)

    # Membership is checked for every returned id of 2000 queries; a set
    # avoids a linear scan of the 1000-element list on each check.
    valid_set = set(valid_indices)

    for i in valid_indices:
        js = index.get_nns_by_item(i, 10000)
        for j in js:
            self.assertTrue(j in valid_set)

    for i in range(1000):
        v = numpy.random.normal(size=(f,))
        js = index.get_nns_by_vector(v, 10000)
        for j in js:
            self.assertTrue(j in valid_set)
def precision(self, n, n_trees=10, n_points=10000):
    """Fraction of the n true nearest neighbours of the origin that the
    index returns.

    Point j is placed at distance j from the origin in a random direction,
    so the true n nearest neighbours of the origin are the ids below n.
    """
    # create random points at distance x
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    # range instead of xrange so the helper runs on Python 3 as well
    for j in range(n_points):
        p = [random.gauss(0, 1) for _ in range(f)]
        norm = sum([pi ** 2 for pi in p]) ** 0.5
        x = [pi / norm * j for pi in p]
        i.add_item(j, x)

    i.build(n_trees)

    nns = i.get_nns_by_vector([0] * f, n)
    # assertEqual: the assertEquals alias was removed in Python 3.12.
    self.assertEqual(nns, sorted(nns))  # should be in order
    # The number of gaps should be equal to the last item minus n-1
    found = len([x for x in nns if x < n])
    return 1.0 * found / n
def RunAnnAnnoy():
    """Build an Annoy index over the training data, query it, and time it.

    Reads `self` and `options` from the enclosing scope.  Consumes the
    options "k" (neighbours per query) and "num_trees".

    Returns the elapsed time reported by the timer, or -1 when a required
    option is missing, k is out of range, or the search raises.
    Raises Exception when unknown options remain.
    """
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    referenceData = np.genfromtxt(self.dataset[0], delimiter=',')
    queryData = np.genfromtxt(self.dataset[1], delimiter=',')
    train, _label = SplitTrainData(self.dataset)

    # Parse options.
    if "k" not in options:
        Log.Fatal("Required option: Number of furthest neighbors to find.")
        return -1
    k = int(options.pop("k"))
    if k < 1 or k > referenceData.shape[0]:
        # k is an int here; the previous k.group(1) raised AttributeError
        # instead of reporting the invalid value.
        Log.Fatal("Invalid k: " + str(k) + "; must be greater than 0" +
                  " and less or equal than " + str(referenceData.shape[0]))
        return -1

    if "num_trees" not in options:
        Log.Fatal("Required option: Number of trees to build")
        return -1
    n = int(options.pop("num_trees"))

    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    with totalTimer:
        try:
            # Perform Approximate Nearest-Neighbors.  Metric given
            # explicitly: relying on the implicit default is deprecated.
            t = AnnoyIndex(train.shape[1], 'angular')
            for i in range(len(train)):
                t.add_item(i, train[i])
            t.build(n)
            # results are discarded: only the elapsed time is reported
            for i in range(len(queryData)):
                t.get_nns_by_vector(queryData[i], k)
        except Exception as e:
            Log.Info(e)
            return -1

    return totalTimer.ElapsedTime()
class SimilarStringStore:
    """Annoy-backed store of strings vectorised by a FeatureGenerator."""

    def __init__(self, **kwargs):
        """Create the transformer and an empty index sized to its features."""
        self.transformer = FeatureGenerator(k=1)
        feature_count = self.transformer.n_features
        print(feature_count)
        self.store = AnnoyIndex(self.transformer.n_features)

    def vectorize(self, s):
        """Return the feature vector of string `s`."""
        return self.transformer.transform(s)

    def add(self, id, s):
        ''' add a string to index; returns its feature vector '''
        features = self.transformer.transform(s)
        item_id = int(id)
        self.store.add_item(item_id, features)
        return features

    def build(self):
        """Build the index with 500 trees."""
        self.store.build(500)

    def save(self, filename='store.knn'):
        """Write the index to `filename`."""
        self.store.save(filename)

    def build_and_save(self, filename='store.knn'):
        """Build the index, then write it to `filename`."""
        self.build()
        self.save(filename)

    def load(self, filename='store.knn'):
        """Load the index from `filename`."""
        self.store.load(filename)

    def query(self, s):
        ''' query index: ids of the 40 nearest neighbours of string `s` '''
        features = self.transformer.transform(s)
        return self.store.get_nns_by_vector(features, 40)

    def remove(self, id):
        ''' remove a string from the index (not implemented: does nothing) '''
        return None
def test_get_nns_with_distances(self):
    """Euclidean index: ids and squared distances for a query by item and
    a query by vector."""
    index = AnnoyIndex(3, 'euclidean')
    for item_id, point in enumerate([[0, 0, 2], [0, 1, 1], [1, 0, 0]]):
        index.add_item(item_id, point)
    index.build(10)

    # query by item 0
    ids, dists = index.get_nns_by_item(0, 3, -1, True)
    self.assertEqual(ids, [0, 1, 2])
    for position, squared in enumerate((0.0, 2.0, 5.0)):
        self.assertAlmostEqual(dists[position]**2, squared)

    # query by vector
    ids, dists = index.get_nns_by_vector([2, 2, 2], 3, -1, True)
    self.assertEqual(ids, [1, 0, 2])
    for position, squared in enumerate((6.0, 8.0, 9.0)):
        self.assertAlmostEqual(dists[position]**2, squared)
def precision(self, n, n_trees=10, n_points=10000, n_rounds=10):
    """Average, over n_rounds, of the fraction of returned ids below n.

    In every round point j is placed at distance j from the origin in a
    random direction, and the origin is queried for n neighbours.
    """
    dim = 10
    hits = 0
    for _round in xrange(n_rounds):
        # create random points at distance x
        index = AnnoyIndex(dim, 'euclidean')
        for radius in xrange(n_points):
            direction = [random.gauss(0, 1) for _axis in xrange(dim)]
            length = sum([c**2 for c in direction])**0.5
            index.add_item(radius, [c / length * radius for c in direction])

        index.build(n_trees)

        nns = index.get_nns_by_vector([0] * dim, n)
        self.assertEqual(nns, sorted(nns))  # should be in order
        # The number of gaps should be equal to the last item minus n-1
        hits += sum(1 for item in nns if item < n)

    return 1.0 * hits / (n * n_rounds)
def nn_approx(ds1, ds2, knn=KNN, metric='manhattan', n_trees=10):
    """Approximate k-nearest-neighbour matching of ds1 rows against ds2 rows.

    Builds an Annoy index over the rows of ds2 and queries it with every
    row of ds1.

    Returns a set of (row index in ds1, row index in ds2) pairs.
    """
    # Build index.
    index = AnnoyIndex(ds2.shape[1], metric=metric)
    for i in range(ds2.shape[0]):
        index.add_item(i, ds2[i, :])
    index.build(n_trees)

    # Search and match in one pass.  The neighbour lists are used directly:
    # packing them into a numpy array is unnecessary and breaks when a query
    # returns fewer than knn neighbours (ragged rows).  The index also no
    # longer shares its name with a loop variable.
    match = set()
    for row in range(ds1.shape[0]):
        for neighbour in index.get_nns_by_vector(ds1[row, :], knn,
                                                 search_k=-1):
            match.add((row, neighbour))

    return match
def precision(self, n, n_trees=10, n_points=10000, n_rounds=10):
    """Average, over n_rounds, of the fraction of returned ids below n for
    an index backed by a freshly created test_db directory.

    In every round point j is placed at distance j from (1000, 0, 0, ...)
    in a random direction, and that centre is queried for n neighbours.
    """
    dim = 10
    hits = 0
    for _round in xrange(n_rounds):
        # start every round from an empty test_db directory
        for shell_command in ("rm -rf test_db", "mkdir test_db"):
            os.system(shell_command)

        # create random points at distance x from (1000, 0, 0, ...)
        index = AnnoyIndex(dim, 10, "test_db", n_trees, 1000, 3048576000, 0)
        for radius in xrange(n_points):
            direction = [random.gauss(0, 1) for _axis in xrange(dim - 1)]
            length = sum([c ** 2 for c in direction]) ** 0.5
            point = [1000] + [c / length * radius for c in direction]
            index.add_item(radius, point)

        centre = [1000] + [0] * (dim-1)
        nns = index.get_nns_by_vector(centre, n)
        self.assertEqual(nns, sorted(nns))  # should be in order
        # The number of gaps should be equal to the last item minus n-1
        hits += sum(1 for item in nns if item < n)

    return 1.0 * hits / (n * n_rounds)
def predict_annoy(descriptors):
    """Vote for reference sheets using the nearest neighbours of each descriptor.

    For every descriptor the configured number of neighbours is looked up
    in the reference index; each neighbour's sheet receives a vote weighted
    by 1/(rank+1) ("antiprop" scheme).  When config.index_lowes_test_ratio
    is set, a descriptor only votes (for its first neighbour) if its
    smallest distance is below ratio * largest distance.

    Returns a list of (sheet, votes) sorted by votes descending, or -1 when
    there are no sheets to vote for.
    Raises NotImplementedError for any other voting scheme.
    """
    u = AnnoyIndex(config.index_descriptor_length, config.index_annoydist)
    u.load(config.reference_index_path)  # super fast, will just mmap the file

    from annoytest import get_sheet_for_id, sheets
    votes = {k: 0 for k in sheets}

    for desc in descriptors:
        # will find the k nearest neighbors
        NN_ids, distances = u.get_nns_by_vector(
            desc, config.index_k_nearest_neighbours, include_distances=True)
        if not NN_ids:
            # nothing to vote for (and min()/max() below need data)
            continue

        if config.index_lowes_test_ratio:
            if min(distances) < config.index_lowes_test_ratio * max(distances):
                # good match
                NN_ids = [NN_ids[0]]
            else:
                continue

        if config.index_voting_scheme != "antiprop":
            # todo: allow other voting schemes in config
            raise NotImplementedError(
                "voting scheme '%s' not implemented"
                % config.index_voting_scheme)

        # vote for the nearest neighbours (codebook response).  The weight
        # comes from the neighbour's own rank; looking the rank up with
        # list.index(name) gave repeated sheet names the weight of their
        # first occurrence.  1.0 keeps the division fractional on Python 2.
        for rank, nn_id in enumerate(NN_ids):
            votes[get_sheet_for_id(nn_id)] += 1.0 / (rank + 1)  # antiproportional weighting

    if votes == {}:
        print("truth not in index")
        return -1
    votes = sorted(votes.items(), key=lambda x: x[1], reverse=True)
    return votes  # most similar prediction is in 0th position
def AnnoyInfer(filename, ids, vec_len):
    """Print the 100 nearest indexed items for every vector in `filename`.

    Each input line is tab-separated; field 0 is a key and field 2 a
    comma-separated vector, zero-padded to vec_len.  For every line the
    output is "<key>\\t<id>,<distance>,<id>,<distance>,...", where ids are
    mapped through `ids`.  Blank lines are skipped.

    Raises IndexError when a line carries more than vec_len values.
    """
    u = AnnoyIndex(vec_len, metric='euclidean')
    u.load('item_title_vec.ann')

    # `with` closes the input file; it used to be left to the garbage collector.
    with open(filename) as infile:
        for line in infile:
            if not line.strip():
                continue
            sp = line.strip().split('\t')
            values = [float(f) for f in sp[2].strip().split(',')]
            if len(values) > vec_len:
                raise IndexError(
                    "vector has %d values, expected at most %d"
                    % (len(values), vec_len))
            vec = values + [0] * (vec_len - len(values))

            neighbours, scores = u.get_nns_by_vector(
                vec, 100, 1000, include_distances=True)
            res = [ids[item] + "," + str(score)
                   for item, score in zip(neighbours, scores)]
            print(sp[0] + "\t" + ",".join(res))
def SMOTE(X, k, oversample_times, aknn_positive):
    """Oversample X with synthetic points interpolated towards neighbours.

    X: 2-d array of samples; row i is assumed to be item i of the index.
    k: number of nearest neighbours to interpolate towards.
    oversample_times: synthetic samples per row; the fractional part is the
        probability of generating one additional sample.
    aknn_positive: path of a saved Annoy index over X.

    Returns X with the synthetic samples appended as additional rows.
    """
    # load AnnoyIndex (metric given explicitly: the implicit default is deprecated)
    feature_dim = X.shape[1]
    index = AnnoyIndex(feature_dim, 'angular')
    index.load(aknn_positive)

    whole = int(oversample_times)
    fraction = oversample_times - whole

    # generate synthetic examples
    X_new = []
    for i in range(X.shape[0]):
        x = X[i]
        # k nearest neighbours excluding the element itself.  Filtering the
        # ordered result keeps exactly the k closest when the element is not
        # among its own neighbours, and never leaves a list shorter than
        # the range the random index is drawn from.
        knn = [j for j in index.get_nns_by_vector(x, k + 1) if j != i][:k]
        if not knn:
            continue
        for _ in range(whole):
            x2 = X[knn[np.random.randint(len(knn))]]
            X_new.append(x + (x2 - x) * np.random.rand())
        if np.random.rand() <= fraction:  # dealing with fractions
            x2 = X[knn[np.random.randint(len(knn))]]
            X_new.append(x + (x2 - x) * np.random.rand())

    # np.float was removed in NumPy 1.24; reshape keeps the array 2-d even
    # when no synthetic sample was generated.
    X_new = np.asarray(X_new, dtype=float).reshape(-1, feature_dim)
    return np.concatenate((X, X_new))
def testTrain(testL, testI, trainL, trainI):
    """Classify every test image by its nearest neighbour in 'test.ann'.

    Each test image (a sequence of rows) is flattened and looked up in the
    784-dimensional index; the label of the nearest training item is
    compared with the test label.  Progress and mismatches are printed.

    Returns the number of correct predictions.
    """
    u = AnnoyIndex(784)
    u.load('test.ann')

    sumCorrect = 0
    for x in xrange(len(testI)):
        if x % 100 == 0:
            print(x)

        # flatten the image row by row
        pixels = [value for row in testI[x] for value in row]

        guess = u.get_nns_by_vector(pixels, 1)
        # unwrap nested lists down to the bare item id
        while isinstance(guess, list):
            guess = guess[0]

        if trainL[guess] == testL[x]:
            sumCorrect = sumCorrect + 1
            continue
        print("wrong! {} != {}".format(trainL[guess], testL[x]))

    print("{}/10000 correct!".format(sumCorrect))
    return sumCorrect
def predict_type_embed(types_embed_array: np.ndarray,
                       types_embed_labels: np.ndarray,
                       indexed_knn: AnnoyIndex,
                       k: int) -> tuple:
    """
    Predict type of given type embedding vectors

    For every embedding the k nearest neighbours are fetched from
    indexed_knn and scored with compute_types_score.

    Returns a 2-tuple (pred_types_embed, pred_types_score): for each input
    vector, the first element of every scored pair, and the scored pairs
    themselves.  (The previous `List[dict]` annotation did not match this.)
    """
    pred_types_embed = []
    pred_types_score = []
    for embed_vec in tqdm(types_embed_array, total=len(types_embed_array),
                          desc="Finding KNNs & Prediction"):
        idx, dist = indexed_knn.get_nns_by_vector(embed_vec, k,
                                                  include_distances=True)
        pred_idx_scores = compute_types_score(dist, idx, types_embed_labels)
        pred_types_embed.append([pred for pred, _score in pred_idx_scores])
        pred_types_score.append(pred_idx_scores)

    return pred_types_embed, pred_types_score
def label_approx(X, sites, site_labels):
    """Label each row of X with the label of its nearest site point.

    An Annoy index (euclidean, 10 trees) is built over `sites`.  Rows for
    which no neighbour is returned get the label None.

    Returns a numpy array with one label per row of X.
    """
    from annoy import AnnoyIndex

    assert (X.shape[1] == sites.shape[1])

    # Build index over site points.
    n_sites, n_dims = sites.shape[0], sites.shape[1]
    aindex = AnnoyIndex(n_dims, metric='euclidean')
    for site in range(n_sites):
        aindex.add_item(site, sites[site, :])
    aindex.build(10)

    labels = []
    for row in range(X.shape[0]):
        # Find nearest site point.
        nearest_site = aindex.get_nns_by_vector(X[row, :], 1)
        if len(nearest_site) >= 1:
            labels.append(site_labels[nearest_site[0]])
        else:
            labels.append(None)

    return np.array(labels)
class face_annoy:
    """Annoy index over face vectors, configured through face_comm.get_conf."""

    def __init__(self):
        """Read the configuration and load the index file if it exists."""
        self.f = int(face_comm.get_conf('annoy', 'face_vector'))
        self.annoy_index_path = os.path.abspath(
            face_comm.get_conf('annoy', 'index_path'))
        self.lmdb_file = os.path.abspath(
            face_comm.get_conf('lmdb', 'lmdb_path'))
        self.num_trees = int(face_comm.get_conf('annoy', 'num_trees'))
        # Metric given explicitly: relying on the implicit default is deprecated.
        self.annoy = AnnoyIndex(self.f, 'angular')
        if os.path.isfile(self.annoy_index_path):
            self.annoy.load(self.annoy_index_path)

    # Build the annoy index from the lmdb file
    def create_index_from_lmdb(self):
        """Index every (key, float32 vector) record of the lmdb database
        and save the result to self.annoy_index_path.  Does nothing when
        the lmdb directory does not exist."""
        lmdb_file = self.lmdb_file
        if os.path.isdir(lmdb_file):
            annoy = AnnoyIndex(self.f, 'angular')
            # Context managers close the transaction and the environment,
            # which were previously left open.
            with lmdb.open(lmdb_file) as env:
                with env.begin() as txn:
                    # iterate over all records
                    for key, value in txn.cursor():
                        key = int(key)
                        print(type(value))
                        # np.fromstring is deprecated for binary data
                        value = np.frombuffer(value, dtype=np.float32)
                        print(value.shape)
                        annoy.add_item(key, value)

            annoy.build(self.num_trees)
            annoy.save(self.annoy_index_path)

    # Reload the index
    def reload(self):
        """Unload the current index and load it again from disk."""
        self.annoy.unload()
        self.annoy.load(self.annoy_index_path)

    # Find similar faces for a face feature vector
    def query_vector(self, face_vector):
        """Return (ids, distances) of the configured number of nearest
        neighbours of `face_vector`."""
        n = int(face_comm.get_conf('annoy', 'num_nn_nearst'))
        print(face_vector.shape)
        return self.annoy.get_nns_by_vector(face_vector, n,
                                            include_distances=True)
class AnnoyIndexing:
    """
    Index features by AnnoyIndex.

    Parameters
    ----------
    path_index : str
        path to the index file (.ann)
    distance_type : str
        distance type for index, ex: euclidean ...
    length : int
        length of the feature vector

    All three are keyword arguments and must be truthy; a missing one
    raises Exception.
    """

    def __init__(self, **kwargs):
        # Checked in this order; each value is stored on the instance
        # before it is validated.
        for option in ('length', 'distance_type', 'path_index'):
            value = kwargs.pop(option, False)
            setattr(self, option, value)
            if not value:
                raise Exception(
                    "AnnoyIndexing() missing 1 required positional argument: "
                    + option
                )

        self.index = AnnoyIndex(self.length, self.distance_type)
        self.index.load(self.path_index)

    def get_knn(self, feature, k):
        '''
        Get k results from index, as returned by Annoy with distances included
        '''
        return self.index.get_nns_by_vector(feature, k, include_distances=True)
def create_collage(input_image, profile_name, version_count):
    """
    given an input image and an existing profile, create a set of new
    collages

    The input image is cut into tiles of the profile's crop size.  Every
    tile is reduced to a grayscale sample and replaced by the crop that the
    profile's index returns for it; collage i uses the i-th nearest
    neighbour.  Results are saved as <i>.png in OUTPUT_DIRECTORY.
    """
    profile_folder = PROFILES_DIRECTORY + profile_name + "/"

    if not os.path.exists(OUTPUT_DIRECTORY):
        os.makedirs(OUTPUT_DIRECTORY)

    # todo: load feature dimensions from profile
    nns_index = AnnoyIndex(SAMPLE_DIMENSION[0]*SAMPLE_DIMENSION[1],
                           metric="euclidean")

    print("loading trees...")
    nns_index.load(profile_folder + profile_name + ".tree")
    print("done.")

    # NOTE: pickle executes arbitrary code on load; profiles must come from
    # a trusted source.  `with` closes the file handle.
    with open(profile_folder + profile_name + ".p", "rb") as profile_file:
        subimage_index = pickle.load(profile_file)

    template_image = Image.open(input_image)
    image_width, image_height = template_image.size[0], template_image.size[1]
    crop_width = subimage_index[-1]["crop_width"]
    crop_height = subimage_index[-1]["crop_height"]

    for i in range(version_count):
        # .format must be applied to the string: print(...).format(...)
        # fails on Python 3 because print() returns None.
        print("Creating collage {}/{}...".format(i+1, version_count))
        output_image = template_image.copy()

        for x in range(0, image_width-crop_width, crop_width):
            for y in range(0, image_height-crop_height, crop_height):
                box = (x, y, x + crop_width, y + crop_height)
                crop_box = output_image.crop(box)
                crop_sample = crop_box.convert("LA").resize(SAMPLE_DIMENSION)
                # keep only the first band of every pixel
                gs_pixeldata = [pixel[0] for pixel in crop_sample.getdata()]

                # NOTE(review): assumes the index returns at least
                # version_count neighbours -- confirm for small profiles.
                image_neighbor = nns_index.get_nns_by_vector(
                    gs_pixeldata, version_count)[i]
                substitute_image = Image.open(
                    subimage_index[image_neighbor]["image"])
                substitute_crop = substitute_image.crop(
                    subimage_index[image_neighbor]["box"])
                output_image.paste(substitute_crop, box)

        output_path = OUTPUT_DIRECTORY + str(i) + ".png"
        output_image.save(output_path, "PNG")

    print("done.")
    print("{} image(s) saved in {}".format(version_count, OUTPUT_DIRECTORY))
    return
def predict_type_embed_task(types_embed_array: np.array, types_embed_labels: np.array, type_space_labels: np.array, pred_task_idx: tuple, indexed_knn: AnnoyIndex, k: int) -> List[dict]:
    """Predict types for embedding vectors and tag every prediction with its task.

    The position of a vector decides its task: below pred_task_idx[0] it is
    a 'Parameter', below pred_task_idx[1] a 'Return', otherwise a 'Variable'.

    Returns one dict per vector with the keys 'original_type',
    'predictions', 'task' and 'is_parametric'.
    """

    def find_pred_task(i: int):
        # boundaries are read on every call, exactly as positions are compared
        if i >= pred_task_idx[0]:
            if i >= pred_task_idx[1]:
                return 'Variable'
            return 'Return'
        return 'Parameter'

    results = []
    progress = tqdm(types_embed_array, total=len(types_embed_array),
                    desc="Finding KNNs & Prediction")
    for position, embed_vec in enumerate(progress):
        idx, dist = indexed_knn.get_nns_by_vector(embed_vec, k,
                                                  include_distances=True)
        scored = compute_types_score(dist, idx, type_space_labels)

        original = types_embed_labels[position]
        entry = {}
        entry['original_type'] = original
        entry['predictions'] = scored
        entry['task'] = find_pred_task(position)
        # parametric: the label has the form name[...]
        entry['is_parametric'] = bool(re.match(r'(.+)\[(.+)\]', original))
        results.append(entry)

    return results
def log_topk_retrieval_acc(self, engine):
    """
    For tracking the performance during training: top K Precision

    Embeds the train and validation sets, indexes the train embeddings in a
    euclidean Annoy index (100 trees), and for k in 5, 10, 20, 30 reports
    the share of the k retrieved train items whose label equals the
    validation item's label.
    """
    cutoffs = [5, 10, 20, 30]

    train_embs, train_labels = extract_embeddings(self.model, self.train_loader)
    val_embs, val_labels = extract_embeddings(self.model, self.val_loader)
    emb_dim = train_embs.shape[1]

    # ----------------------------------
    # build a forest of trees over the train embeddings
    forest = AnnoyIndex(emb_dim, metric='euclidean')
    n_trees = 100
    for item_id, train_vec in enumerate(train_embs):
        forest.add_item(item_id, train_vec)
    forest.build(n_trees)

    # ----------------------------------
    # Measure Prec@[5, 10, 20, 30]: count label matches per cutoff
    top_k_corrects = dict()
    for position, val_vec in enumerate(val_embs):
        wanted = val_labels[position]
        for k in cutoffs:
            neighbours = forest.get_nns_by_vector(val_vec, k)
            retrieved = train_labels[neighbours]
            matches = np.sum(retrieved == wanted)
            top_k_corrects[k] = top_k_corrects.get(k, 0) + matches

    # -------------------------------------------------
    # calculate back the acc
    top_k_acc = dict()
    for k in cutoffs:
        top_k_acc[k] = top_k_corrects[k] / k / val_embs.shape[0]

    headline = "Top K Retrieval Results - Epoch: {} Avg top-k accuracy:"
    tqdm.write(headline.format(engine.state.epoch))

    for k in cutoffs:
        tqdm.write(" Prec@{} = {:.2f}".format(k, top_k_acc[k]))
def search_csv(csv_path='/midata/private/journal/files.csv', query='Misima island port harbor derelict ship PNG Papua New Guinnea Australis harbor storm sailing cliffs anchor drag', num_results=10, num_dims=300):
    """Print the files whose indexed vectors are nearest to `query`.

    The CSV at csv_path must have a 'path' column; the Annoy index
    'files_index.ann' is expected next to it.  For each of the num_results
    nearest items the path and the first 10 lines of the file are printed.

    Returns None.
    """
    from itertools import islice

    df = pd.read_csv(csv_path, index_col=0)
    index_path = os.path.join(os.path.dirname(csv_path), 'files_index.ann')
    # Metric given explicitly: relying on the implicit default is deprecated.
    index = AnnoyIndex(f=num_dims, metric='angular')
    index.load(index_path)
    vec = nlp(query).vector

    for i in index.get_nns_by_vector(vec, num_results):
        path = df.iloc[i]['path']
        print(path)
        with open(path, 'rb') as fin:
            # read only the first 10 lines instead of the whole file
            bintext = b''.join(islice(fin, 10))
        try:
            text = bintext.decode()
        except UnicodeDecodeError:
            text = bintext.decode('latin')
        print(text)
        print('-' * 120)
    return
def nearest_approx(X, sites):
    """Group the rows of X by their nearest site point.

    An Annoy index (manhattan metric) is built over `sites` with
    max(10, int(log2(number of rows of X))) trees.

    Returns a dict mapping every site index to the list of X row indices
    whose nearest site it is; rows without a returned neighbour are skipped.
    """
    from annoy import AnnoyIndex

    assert (X.shape[1] == sites.shape[1])

    n_sites = sites.shape[0]
    n_rows = X.shape[0]

    # Build index over site points.
    aindex = AnnoyIndex(sites.shape[1], metric='manhattan')
    for site in range(n_sites):
        aindex.add_item(site, sites[site, :])
    tree_count = max(10, int(np.log2(n_rows)))
    aindex.build(tree_count)

    site_to_idx = dict((site, []) for site in range(n_sites))

    for row in range(n_rows):
        # Find nearest site point.
        nearest_sites = aindex.get_nns_by_vector(X[row, :], 1)
        if len(nearest_sites) >= 1:
            site_to_idx[nearest_sites[0]].append(row)

    return site_to_idx
def feat_match(descs1, descs2):
    """Match descriptors of descs1 to descs2 using a nearest-neighbour ratio test.

    Both arguments hold one descriptor per column.  Column i of descs1 is
    matched to its nearest column of descs2 when the nearest distance is
    below 0.6 times the second-nearest distance.

    Returns an int array with one entry per column of descs1: the matched
    column of descs2, or -1 when the ratio test fails or fewer than two
    neighbours are available.
    """
    n1, points1 = descs1.shape[:2]
    points2 = descs2.shape[1]

    t = AnnoyIndex(n1, metric="euclidean")
    for i in range(points2):
        t.add_item(i, descs2[:, i])
    t.build(50)

    matches = np.full(points1, -1, dtype=int)
    for i in range(points1):
        p_index, dist = t.get_nns_by_vector(descs1[:, i], 2,
                                            include_distances=True)
        # Written as a product so a zero second distance cannot divide by
        # zero; the length check covers an index with fewer than two items.
        if len(dist) >= 2 and dist[0] < 0.6 * dist[1]:
            matches[i] = p_index[0]

    return matches
class KNNIndex(object):
    """Annoy index wrapper that refuses queries until an index file is loaded."""

    # class-level defaults, replaced per instance in __init__ / load / unload
    annoy = None
    vec_len = -1
    metric = 'euclidean'
    is_loaded = False

    def __init__(self, vec_len, metric='euclidean', index_file=None):
        """Create the index; load `index_file` right away when one is given."""
        self.vec_len = vec_len
        self.metric = metric
        self.annoy = AnnoyIndex(self.vec_len, self.metric)
        if index_file:
            self.load(index_file)

    def get_nns_by_item(self, i, n, search_k=-1, include_distances=False):
        """Forward to AnnoyIndex.get_nns_by_item; RuntimeError if nothing is loaded."""
        if not self.is_loaded:
            raise RuntimeError("Annoy index file is not loaded!")
        return self.annoy.get_nns_by_item(i, n, search_k, include_distances)

    def get_nns_by_vector(self, v, n, search_k=-1, include_distances=False, n_propagation=0):
        """Forward to AnnoyIndex.get_nns_by_vector; RuntimeError if nothing is
        loaded.  `n_propagation` is accepted but not used."""
        if not self.is_loaded:
            raise RuntimeError("Annoy index file is not loaded!")
        return self.annoy.get_nns_by_vector(v, n, search_k, include_distances)

    def load(self, index_file):
        """Load `index_file` and mark the index as usable."""
        self.annoy.load(index_file)
        self.is_loaded = True

    def unload(self):
        """Unload the index and mark it as unusable."""
        self.annoy.unload()
        self.is_loaded = False
class VisBotTextBrain:
    """Text search: averages word vectors of a request and looks the result
    up in an Annoy index."""

    def __init__(self, model_file, annoy_file):
        """Load the word-vector model and the Annoy index from ./data/."""
        print("Loading w2v model...")
        self.model = KeyedVectors.load("./data/" + model_file)  # Word2Vec.load
        # Metric given explicitly: relying on the implicit default is deprecated.
        self.annoy = AnnoyIndex(self.model.wv.vector_size, 'angular')
        print("Loading Annoy...")
        self.annoy.load("./data/" + annoy_file)

    def run(self, request=None):
        """
        :param: request: str value
        :return: list of indexes in DB
        :raises ValueError: when the normalized request is empty
        """
        request_list = normalize_text(request).split(' ')
        if (len(request_list) == 1) and (request_list[0] == ''):
            raise ValueError('Incorrect request')
        vect_repr = self._get_vect_representation(request_list)
        self.request = request
        self.request_vector = vect_repr
        return self.annoy.get_nns_by_vector(vect_repr, n=100)

    def _get_vect_representation(self, request_list):
        """Return the mean word vector of `request_list`.

        Words missing from the model count as zero vectors; an empty list
        gives a zero vector.  The zero vectors take their length from the
        model (it was hard-coded to 300 before, which only matched
        300-dimensional models).
        """
        dim = self.model.wv.vector_size
        vect_repr = []
        for word in request_list:
            try:
                vect_repr.append(self.model.wv[word])
            except KeyError:
                vect_repr.append([0] * dim)
        if vect_repr:
            vect_repr = np.mean(np.array(vect_repr), axis=0)
        else:
            vect_repr = np.array([0] * dim)
        return vect_repr
def get_best_match(self, pred_vects, lbl_features, labels, k, expand=True, num_trees=100):
    '''
    Match every label feature vector to indices of predicted vectors.

    Goal: get 2 closest for every label
    if expand = True
     - Avoid using closest index in image vectors for another label
    else:
     Can get same index for another label

    Method to get match. A little faster than original code

    An Annoy index is built over pred_vects (dimension taken from
    lbl_features[0]) with num_trees trees.  `labels` is not used in this
    method.

    Returns (image_feature_inds, final_lbl_featues): two parallel lists
    holding a matched index into pred_vects and the label feature it was
    matched to.
    '''
    t = AnnoyIndex(len(lbl_features[0]))
    for i in range(len(pred_vects)):
        t.add_item(i, pred_vects[i])
    t.build(num_trees)
    image_feature_inds = []
    final_lbl_featues = []
    # number of neighbours requested for the current label
    temp = 0
    # indices into pred_vects that have already been handed out
    used_inds = {}
    for x in range(len(lbl_features)):
        if expand == False:
            temp = 2
        else:
            # the request grows by k with every label, so later labels see
            # more candidates than earlier ones
            temp += k
        indices = t.get_nns_by_vector(lbl_features[x], temp, include_distances=False)
        # matches recorded for this label so far
        add = 0
        for ind in indices:
            # NOTE(review): stops once add exceeds k, so up to k+1 matches
            # can be recorded per label -- confirm `>` vs `>=` is intended.
            if (add > k):
                break
            if temp == len(pred_vects):
                # Edge case. If every index is used: record the nearest
                # index for this label and return right away, leaving any
                # remaining labels unmatched.
                final_lbl_featues.append(lbl_features[x])
                image_feature_inds.append(indices[0])
                return image_feature_inds, final_lbl_featues
            if ind in used_inds and expand:
                # already given to another label; only skipped when expanding
                continue
            else:
                if ind not in used_inds:
                    used_inds[ind] = 1
                final_lbl_featues.append(lbl_features[x])
                image_feature_inds.append(ind)
                add += 1
    return image_feature_inds, final_lbl_featues
class face_annoy:
    """Annoy index over 512-dimensional face vectors stored under ``~/acs/data``."""

    def __init__(self):
        self.f = 512
        home_dir = os.path.expanduser('~')
        self.annoy_index_path = os.path.abspath(home_dir + "/acs/data/face_vector.nn")
        self.num_trees = 100
        self.annoy = AnnoyIndex(self.f)
        # Only load when an index file has already been saved.
        index_file_present = os.path.isfile(self.annoy_index_path)
        if index_file_present:
            self.annoy.load(self.annoy_index_path)

    # Build the Annoy index from the stored embeddings (originally an lmdb file).
    def create_index_from_lmdb(self):
        rows = dbsql.getallem()
        if not len(rows) > 0:
            return
        fresh_index = AnnoyIndex(self.f)
        # Walk every stored row: column 0 is the item id, column 1 the
        # serialised embedding.
        for row in rows:
            fresh_index.add_item(row[0], str2embed(row[1]))
        fresh_index.build(self.num_trees)
        fresh_index.save(self.annoy_index_path)

    # Reload the index from disk.
    def reload(self):
        self.annoy.unload()
        self.annoy.load(self.annoy_index_path)

    # Find the most similar stored entry for a face feature vector.
    def query_vector(self, face_vector):
        # Only the single nearest neighbour is requested; the result is the
        # (ids, distances) pair returned by Annoy.
        return self.annoy.get_nns_by_vector(face_vector, 1, include_distances=True)
def find_article_by_text(self, title, annotation, key_words, checkbox):
    """Return the rows of ``self.data`` closest to a free-text query.

    The normalised title, annotation and key words are split into words,
    their word2vec vectors are summed into one 100-dimensional query vector,
    and the 13 nearest items are fetched from the Annoy index stored at
    ``./annoy``.

    :param title: query title text
    :param annotation: query annotation text
    :param key_words: query key words text
    :param checkbox: not used by this method
    :return: ``self.data`` rows selected by label for the neighbours found,
        in neighbour order
    """
    data_storage = {
        row_label: row['title'] + ' ' + row['annotation']
        for row_label, row in self.data.iterrows()
    }
    # NOTE: pickle.load can execute arbitrary code; this file must be trusted.
    with open('map_id_to_hash_products.dict', 'rb') as mapping_file:
        map_id_2_prod_hash = pkl.load(mapping_file)
    index_title_emb = AnnoyIndex(100)
    index_title_emb.load('./annoy')
    model = Word2Vec.load('./w2v_products.w2v_gensim')
    app.logger.info('Запрос' + title)

    words = (self.normalize_text(title) + ' ' +
             self.normalize_annotation(annotation) + ' ' +
             self.normalize_key_words(key_words)).split(' ')
    vec = np.zeros(100)
    for word in words:
        try:
            word_vec = model[word]
        except KeyError:
            # Out-of-vocabulary words contribute nothing to the query vector.
            continue
        vec += word_vec

    neighbour_ids, neighbour_dists = index_title_emb.get_nns_by_vector(
        vec, 13, include_distances=True)
    app.logger.info('Соседи:')
    found = []
    for annoy_id, annoy_dist in zip(neighbour_ids, neighbour_dists):
        image_id = map_id_2_prod_hash[annoy_id]
        found.append(image_id)
        # Pass the values as logging arguments with an explicit format
        # string; the article text itself must not be used as the format.
        # 1 - d**2 / 2 turns Annoy's angular distance into a cosine similarity.
        app.logger.info('%s %s', data_storage[image_id], 1 - annoy_dist**2 / 2)
    # ``.ix`` was removed from pandas; these ids are index labels, so use ``.loc``.
    return self.data.loc[found]
def classify_cells(args, data_pt, all_sims_timepoints, ann_dir):
    """Label simulated points by majority vote of their 10 nearest neighbours.

    The Annoy index stored at ``ann_dir`` (euclidean metric) is queried for
    every vector ``timepoint[i][j]``. The returned item ids select rows of a
    frame built from ``data_pt["xp"]`` whose index is ``data_pt["celltype"]``,
    and the most common index label among those rows becomes the prediction.

    :param args: not used by this function
    :param data_pt: mapping with keys ``"meta"``, ``"celltype"`` and ``"xp"``
    :param all_sims_timepoints: sequence of timepoints; each is indexed as
        ``timepoint[i][j]`` (presumably simulation x cell; confirm with caller)
    :param ann_dir: path of the saved Annoy index
    :return: nested list of labels mirroring ``all_sims_timepoints``
    """
    n_neighbors = 10
    meta = data_pt["meta"]  # read but otherwise unused
    cell_types = data_pt["celltype"]
    reference = pd.DataFrame(data_pt["xp"], cell_types)

    # The vector length is taken from the first entry of the first timepoint.
    index = AnnoyIndex(all_sims_timepoints[0][0].shape[1], 'euclidean')
    index.load(ann_dir)

    def majority_label(vector):
        neighbour_rows = reference.iloc[index.get_nns_by_vector(vector, n_neighbors)]
        winner, _votes = Counter(neighbour_rows.index).most_common(2)[0]
        return winner

    predictions = []
    for timepoint in all_sims_timepoints:
        predictions.append([
            [majority_label(timepoint[i][j]) for j in range(len(timepoint[0]))]
            for i in range(len(timepoint))
        ])
    return predictions
class Annoy(AnnBase):
    """``AnnBase`` backend that stores and searches vectors with an Annoy index."""

    def __init__(self, vector_len: int, metric: str = 'angular', **kwargs):
        """Create an empty index for vectors of length ``vector_len``.

        Remaining keyword arguments are forwarded to ``AnnBase``.
        """
        super().__init__(**kwargs)
        self.index = AnnoyIndex(vector_len, metric=metric)

    def build_index(self, num_trees: int = 30):
        """Add every entry of ``self.data`` (ids are positions) and build the trees."""
        for position, vector in enumerate(self.data):
            self.index.add_item(position, vector)
        self.index.build(num_trees)

    def search_vec_top_n(self, vector, n: int = 5):
        """Return the ``self.mapping`` entries of the ``n`` nearest items."""
        return [self.mapping[item_id]
                for item_id in self.index.get_nns_by_vector(vector, n)]

    def _load_file(self, path: str, **kwargs):
        """Load a saved index from ``path``; extra keyword arguments are ignored."""
        self.index.load(path)

    def _save_file(self, path: str):
        """Save the index to ``path``."""
        self.index.save(path)
def label_approx(X, sites, site_labels, k=1):
    """Label each row of ``X`` by majority vote of its ``k`` nearest sites.

    A euclidean Annoy index with 10 trees is built over the rows of
    ``sites``. Each row of ``X`` receives the most common ``site_labels``
    value among its nearest sites, or ``None`` if the query returns nothing.

    :param X: 2-D array of query points
    :param sites: 2-D array of site points with as many columns as ``X``
    :param site_labels: labels indexed by site row number
    :param k: number of nearest sites that vote
    :return: numpy array with one label per row of ``X``
    """
    from annoy import AnnoyIndex

    assert (X.shape[1] == sites.shape[1])

    # Build index over site points.
    site_index = AnnoyIndex(sites.shape[1], metric='euclidean')
    for site_id in range(sites.shape[0]):
        site_index.add_item(site_id, sites[site_id, :])
    site_index.build(10)

    def vote(point):
        # Find the nearest site points and take their most common label.
        hits = site_index.get_nns_by_vector(point, k)
        if len(hits) < 1:
            return None
        return Counter(site_labels[hit] for hit in hits).most_common(1)[0][0]

    return np.array([vote(X[row, :]) for row in range(X.shape[0])])
def thingR():
    """Return tile URLs for the 30 index items nearest to the requested image.

    The image URL is rebuilt from the ``url`` request argument plus a fixed
    list of query parameters, turned into a feature vector with ``eF`` and
    looked up in the 128-dimensional index ``imageFeatSuhas.ann``. Each
    neighbour id is split into quotient and remainder by 320 and formatted
    into a tile URL (presumably tile row/column; confirm against how the
    index was built).

    :return: the tile URLs concatenated into one string, each followed by
        ``###``
    """
    query_keys = ('REQUEST', 'TIME', 'BBOX', 'CRS', 'LAYERS', 'WRAP',
                  'FORMAT', 'WIDTH', 'HEIGHT', 'ts')
    image_url = request.args.get('url')
    separator = "?"
    for key in query_keys:
        image_url = image_url + separator + key + "=" + request.args.get(key)
        separator = "&"

    index = AnnoyIndex(128)
    index.load('imageFeatSuhas.ann')
    neighbour_ids, _distances = index.get_nns_by_vector(
        eF(image_url, model2), 30, include_distances=True)

    tile_prefix = "https://gibs.earthdata.nasa.gov/wmts/epsg4326/best/MODIS_Terra_CorrectedReflectance_TrueColor/default/2012-07-09/250m/8/"
    tile_urls = []
    for item_id in neighbour_ids:
        quotient, remainder = divmod(item_id, 320)
        tile_urls.append(
            tile_prefix + str(quotient + 100) + "/" + str(remainder) + ".jpg###")
    return "".join(tile_urls)
def nn_annoy(ds1, ds2, names1, names2, knn = 20, metric='euclidean', n_trees = 50, save_on_disk = True):
    """Pair every row of ``ds1`` with its approximate nearest rows of ``ds2``.

    Rows of ``ds2`` are indexed under their row number, so ``names2`` must be
    addressable by that zero-based row position (and ``names1`` by the row
    position in ``ds1``).

    :param ds1: 2-D array of query vectors
    :param ds2: 2-D array of vectors to index
    :param names1: names indexed by row position of ``ds1``
    :param names2: names indexed by row position of ``ds2``
    :param knn: number of neighbours requested per query row
    :param metric: Annoy metric name
    :param n_trees: number of trees to build
    :param save_on_disk: when true the index is built in the file
        ``annoy.index`` in the current directory instead of in memory
    :return: set of ``(name1, name2)`` tuples
    """
    # Build index.
    index = AnnoyIndex(ds2.shape[1], metric=metric)
    if save_on_disk:
        index.on_disk_build('annoy.index')
    for row in range(ds2.shape[0]):
        index.add_item(row, ds2[row, :])
    index.build(n_trees)

    # Search and match in one pass. The neighbour lists are consumed as
    # returned instead of being stacked into one array, so a query that
    # yields fewer than ``knn`` neighbours cannot produce a ragged array.
    match = set()
    for query_row in range(ds1.shape[0]):
        neighbours = index.get_nns_by_vector(ds1[query_row, :], knn, search_k=-1)
        match.update(
            (names1[query_row], names2[neighbour]) for neighbour in neighbours)
    return match
def ranking():
    """Prompt for a sentence and print the stored sentences closest to it.

    The sentence is encoded with the ``bert-base-nli-mean-tokens`` sentence
    transformer and looked up in a 768-dimensional euclidean Annoy index.
    Neighbour ids are translated back to text through the JSON mapping
    ``./docs/index_sen_20000.json``, whose keys are the ids as strings.

    The query uses Annoy's default ``search_k`` rather than ``search_k=3``,
    which restricted the search to a handful of candidates. Results are
    printed by iterating over whatever comes back, so fewer than three hits
    no longer raises ``IndexError``.
    """
    with open('./docs/index_sen_20000.json', 'r') as file:
        index_sen = json.load(file)
    dimensions = 768
    index = AnnoyIndex(dimensions, 'euclidean')
    index.load('./docs/sentence_embedding_20000_200.ann')

    query = input('Enter your sentence here')
    print('Top 3 Related Questions:')
    encoder = SentenceTransformer('bert-base-nli-mean-tokens')
    query_vector_ls = encoder.encode([query])[0].tolist()

    for item_id in index.get_nns_by_vector(query_vector_ls, 3):
        print(index_sen[str(item_id)])
def find_nn(self, near_neigh=50):
    """Fill ``self.distance`` and the transition list from nearest postures.

    Every posture of every motion is described by
    ``joint_poses_btw_posture_in_plane_by_joint_pos`` relative to the first
    posture of the first motion. The descriptors are indexed in a euclidean
    Annoy index with 20 trees, and each one is queried for its ``near_neigh``
    nearest neighbours. A neighbour more than 10 positions away in the
    flattened posture order is written to ``self.distance`` and added as a
    transition. The transitions are then sorted and printed.

    Neighbours are consumed as ``(id, distance)`` pairs, so an index that
    returns fewer than ``near_neigh`` results no longer raises ``IndexError``.

    :param near_neigh: number of neighbours requested per posture
    """
    reference = self.motions[0][0]
    dim = joint_poses_btw_posture_in_plane_by_joint_pos(
        reference, reference).shape[0]
    size = sum(map(len, self.motions))
    self.distance = np.zeros((size, size))

    data = []
    for motion in self.motions:
        for frame in range(len(motion)):
            data.append(
                joint_poses_btw_posture_in_plane_by_joint_pos(
                    motion[frame], reference))

    index = AnnoyIndex(dim, metric='euclidean')
    for item_id in range(size):
        index.add_item(item_id, data[item_id])
    index.build(20)

    for i in range(size):
        neighbours, dists = index.get_nns_by_vector(
            data[i], near_neigh, include_distances=True)
        for neighbour, dist in zip(neighbours, dists):
            if abs(i - neighbour) > 10:
                self.distance[i, neighbour] = dist
                # TODO: carried over from the original, which gave no description.
                self.add_transition(
                    MotionTransition(0, i, 0, neighbour, dist))

    self.transition.sort()
    for transition in self.transition:
        print('check: ', transition.motion_from_idx, transition.motion_to_idx)
def get_mind_recall_res(user_embs, doc_embs, user_idx_2_rawid, doc_idx_2_rawid, topk):
    """Nearest-neighbour recall of documents for every user, using an Annoy tree.

    Side effects: the index is saved to ``annoy.ann`` and the result is
    pickled to ``mind_u2i_dict.pkl`` in the current directory.

    :param user_embs: 2-D array of user vectors
    :param doc_embs: iterable of document vectors; ids are their positions
    :param user_idx_2_rawid: lookup from user position to raw user id
    :param doc_idx_2_rawid: lookup from document position to raw doc id
    :param topk: number of documents recalled per user
    :return: dict mapping raw user id to a list of ``(raw_doc_id, score)``
        tuples sorted by score in descending order
    """
    # Build the index tree from doc_embs.
    f = user_embs.shape[1]
    t = AnnoyIndex(f, 'angular')
    for doc_idx, doc_vec in enumerate(doc_embs):
        t.add_item(doc_idx, doc_vec)
    t.build(10)
    # The index tree can be saved for later reuse.
    t.save('annoy.ann')

    # For every user vector, fetch the TopK nearest items.
    user_recall_items_dict = {}
    for user_idx, user_vec in enumerate(user_embs):
        # Annoy returns ([doc_idx], [scores]); translate to raw doc ids.
        doc_idxs, scores = t.get_nns_by_vector(user_vec, topk, include_distances=True)
        raw_doc_ids = [doc_idx_2_rawid[doc_idx] for doc_idx in doc_idxs]
        # Translate to the real user id; users without one are skipped.
        try:
            raw_user_id = user_idx_2_rawid[user_idx]
        except (KeyError, IndexError):
            continue
        user_recall_items_dict[raw_user_id] = dict(zip(raw_doc_ids, scores))

    # Scores come back in ascending order; the result is stored descending.
    # NOTE(review): the scores are Annoy angular distances, so descending
    # order puts the farthest document first. Confirm this is intended.
    user_recall_items_dict = {
        k: sorted(v.items(), key=lambda x: x[1], reverse=True)
        for k, v in user_recall_items_dict.items()
    }

    # Keep a copy on disk.
    with open('mind_u2i_dict.pkl', 'wb') as out_file:
        pickle.dump(user_recall_items_dict, out_file)
    return user_recall_items_dict
class Annoy(ANN):
    """
    Builds an ANN model using the Annoy library.
    """

    def load(self, path):
        """Create an index from the stored configuration and load ``path`` into it."""
        dimensions = self.config["dimensions"]
        metric = self.config["metric"]
        self.model = AnnoyIndex(dimensions, metric)
        self.model.load(path)

    def index(self, embeddings):
        """Build a 10-tree index over the rows of ``embeddings``."""
        # Inner product is equal to cosine similarity on normalized vectors,
        # so the metric is always forced to "dot" when indexing.
        self.config["metric"] = "dot"

        self.model = AnnoyIndex(self.config["dimensions"], self.config["metric"])

        # Item ids are the row positions.
        row_count = embeddings.shape[0]
        for row in range(row_count):
            self.model.add_item(row, embeddings[row])

        self.model.build(10)

    def search(self, query, limit):
        """Return ``[(id, score)]`` for the ``limit`` nearest items to ``query``."""
        ids, scores = self.model.get_nns_by_vector(query, n=limit, include_distances=True)
        return [(item_id, score) for item_id, score in zip(ids, scores)]

    def save(self, path):
        """Write the index to ``path``."""
        self.model.save(path)
def get_celeb_prediction(img, ann_filepath, celeb_mapping_path):
    """Annotate ``img`` with celebrity names found through an Annoy lookup.

    Each encoding returned by ``get_encoding_new`` is queried for its 10
    nearest neighbours in the 2048-dimensional angular index at
    ``ann_filepath``. ``get_celeb_name_from_id`` turns the neighbours into a
    name-to-count mapping, and a name is accepted only when its count
    exceeds 3. The image gets a rectangle per face, the name for accepted
    faces, and is finally resized to width 400.

    :param img: image to analyse; it is drawn on in place before resizing
    :param ann_filepath: path of the saved Annoy index
    :param celeb_mapping_path: forwarded to ``get_celeb_name_from_id``
    :return: ``(data, img)`` where ``data`` is a list of dicts with keys
        ``bbox``, ``celeb_name`` and ``confidence``; ``(None, None)`` when
        no encodings were produced
    """
    ann_index = AnnoyIndex(2048, 'angular')
    ann_index.load(ann_filepath)

    encs, bbox = get_encoding_new(img)
    if encs is None:
        return None, None

    dist_threshold = 0.9
    data = []
    for index, enc in enumerate(encs):
        cv2.rectangle(img, bbox[index], (255, 0, 0), 2)
        results = ann_index.get_nns_by_vector(enc[0],
                                              10,
                                              search_k=-1,
                                              include_distances=True)
        celeb_count_dict = get_celeb_name_from_id(results, celeb_mapping_path,
                                                  dist_threshold)
        # Distance of the single closest neighbour drives the confidence.
        distance = results[1][0]

        recognised = (len(celeb_count_dict) != 0
                      and max(celeb_count_dict.values()) > 3)
        if recognised:
            celeb_name = max(celeb_count_dict, key=celeb_count_dict.get)
            cv2.putText(img, celeb_name.upper(),
                        (bbox[index][0] - 5, bbox[index][1] - 5),
                        cv2.FONT_HERSHEY_DUPLEX, 1, (0, 0, 255), 1)
            entry = {
                "bbox": bbox[index],
                "celeb_name": celeb_name,
                "confidence": face_distance_to_conf(distance),
            }
        else:
            entry = {
                "bbox": bbox[index],
                "celeb_name": "unknown",
                "confidence": 0.0,
            }
        data.append(entry)

    img = imutils.resize(img, width=400)
    return data, img
# Fragment of a Python 2 script (print statements), truncated at both ends: it uses names defined earlier (`ai`, `ts`, `ll_to_3d`, `vmin`, `vmax`) and the final `maps = [` list is never closed. It builds the Annoy index `ai` with 10 trees, samples a 0.25-degree longitude/latitude grid, and for each grid point takes the 50 nearest index items, averages their `ts` values that fall below the 90th percentile, clips the mean to [vmin, vmax] and stores it in `Z` (progress is printed every 1000 points). It then starts defining the basemap projections to plot.
print 'building index...' ai.build(10) print 'building up data points' lons = np.arange(-180, 180, 0.25) lats = np.arange(-90, 90, 0.25) X, Y = np.meshgrid(lons, lats) Z = np.zeros(X.shape) count = 0 for i, _ in np.ndenumerate(Z): lon, lat = X[i], Y[i] v = ll_to_3d(lat, lon) js = ai.get_nns_by_vector(v, 50) all_ts = [ts[j] for j in js] cutoff = np.percentile(all_ts, 90) p = np.mean([t for t in all_ts if t < cutoff]) p = np.clip(p, vmin, vmax) Z[i] = p count += 1 if count % 1000 == 0: print count, np.prod(Z.shape) print 'plotting' maps = [ ('nyc', (20, 20), basemap.Basemap(projection='ortho',lat_0=30,lon_0=-30,resolution='l')), ('asia', (20, 20), basemap.Basemap(projection='ortho',lat_0=23,lon_0=105,resolution='l')), ('world', (20, 10), basemap.Basemap(projection='cyl', llcrnrlat=-60,urcrnrlat=80,\ llcrnrlon=-180,urcrnrlon=180,resolution='c'))
class AnnoyIndexer(object):
    """Annoy index over the normalised vectors of a Word2Vec or Doc2Vec model."""

    def __init__(self, model=None, num_trees=None):
        """Optionally build the index straight away.

        The index is built only when both ``model`` and ``num_trees`` are
        truthy; otherwise the instance stays empty until ``load`` is called.

        :raises ValueError: if ``model`` is neither Doc2Vec nor Word2Vec
        """
        self.index = None
        self.labels = None
        self.model = model
        self.num_trees = num_trees

        if not (model and num_trees):
            return
        if isinstance(self.model, Doc2Vec):
            self.build_from_doc2vec()
        elif isinstance(self.model, Word2Vec):
            self.build_from_word2vec()
        else:
            raise ValueError("Only a Word2Vec or Doc2Vec instance can be used")

    def save(self, fname, protocol=2):
        """Save the index to ``fname`` and its metadata to ``fname + '.d'``."""
        self.index.save(fname)
        metadata = {
            'f': self.model.vector_size,
            'num_trees': self.num_trees,
            'labels': self.labels,
        }
        with smart_open(fname + '.d', 'wb') as fout:
            _pickle.dump(metadata, fout, protocol=protocol)

    def load(self, fname):
        """Restore the index and metadata written by ``save``.

        :raises IOError: if ``fname`` or ``fname + '.d'`` does not exist
        """
        fname_dict = fname + '.d'
        if not os.path.exists(fname) or not os.path.exists(fname_dict):
            raise IOError(
                "Can't find index files '%s' and '%s' - Unable to restore AnnoyIndexer state." % (fname, fname_dict))

        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with smart_open(fname_dict) as fin:
            metadata = _pickle.loads(fin.read())
        self.num_trees = metadata['num_trees']
        self.index = AnnoyIndex(metadata['f'])
        self.index.load(fname)
        self.labels = metadata['labels']

    def build_from_word2vec(self):
        """Build an Annoy index using word vectors from a Word2Vec model"""
        self.model.init_sims()
        word_vectors = self.model.wv
        return self._build_from_model(
            word_vectors.syn0norm, word_vectors.index2word, self.model.vector_size)

    def build_from_doc2vec(self):
        """Build an Annoy index using document vectors from a Doc2Vec model"""
        docvecs = self.model.docvecs
        docvecs.init_sims()
        labels = [docvecs.index_to_doctag(i) for i in range(0, docvecs.count)]
        return self._build_from_model(
            docvecs.doctag_syn0norm, labels, self.model.vector_size)

    def _build_from_model(self, vectors, labels, num_features):
        """Index ``vectors`` under their positions and remember ``labels``."""
        built = AnnoyIndex(num_features)
        for position, vector in enumerate(vectors):
            built.add_item(position, vector)
        built.build(self.num_trees)

        self.index = built
        self.labels = labels

    def most_similar(self, vector, num_neighbors):
        """Find the top-N most similar items"""
        ids, distances = self.index.get_nns_by_vector(
            vector, num_neighbors, include_distances=True)
        # Each distance d is reported as the score 1 - d / 2.
        return [(self.labels[item_id], 1 - distance / 2)
                for item_id, distance in zip(ids, distances)]
# Fragment of a Python 2 script that starts mid-block, so the original nesting cannot be recovered from here. The first statements finish an indexing step: they print `layer_size`, build the index `t` with `ntrees` trees, save it to 'index.ann' and close the shelf `s`. When `args.search` is set, it reopens the shelf 'data.bin', loads 'index.ann' into an index sized from the shelf's 'layer_size' entry, runs `dd.post_predict` on the query image, and for every returned ROI queries `args.search_size` neighbours of `roi['vals']`, prints their distances, looks up each neighbour id in the shelf, and draws the ROI bounding box and category on the query image.
print 'layer_size=',layer_size t.build(ntrees) t.save('index.ann') s.close() if args.search: s = shelve.open('data.bin') u = AnnoyIndex(s['layer_size'],metric) u.load('index.ann') data = [args.search] classif = dd.post_predict(sname,data,parameters_input,parameters_mllib,parameters_output) # search for every roi res = classif['body']['predictions'][0]['rois'] print('number of ROI in query: ' + str(len(res))) for roi in res: near = u.get_nns_by_vector(roi['vals'],args.search_size,include_distances=True) near_data = [] near_distance = [] for n in near[1]: near_distance.append(n) print('distances: ') print(near_distance) for n in near[0]: near_data.append(s[str(n)]) # print query bbox img = cv2.imread(args.search) bbox = roi['bbox'] cat = roi['cat'] cv2.rectangle(img, (int(bbox['xmin']),int(bbox['ymax'])),(int(bbox['xmax']),int(bbox['ymin'])),(255,0,0),2) cv2.putText(img,cat,(int(bbox['xmin']),int(bbox['ymax'])),cv2.FONT_HERSHEY_PLAIN,1,255)
# Demo: build an Annoy index of 1000 random 40-dimensional vectors, save it
# to 'test.ann', load it back and query it by item id and by vector.
from annoy import AnnoyIndex
import random

f = 40
t = AnnoyIndex(f)  # Length of item vector that will be indexed
# ``range`` instead of ``xrange`` so the script runs on Python 2 and 3 alike.
for i in range(1000):
    v = [random.gauss(0, 1) for _ in range(f)]
    t.add_item(i, v)

t.build(10)  # 10 trees
t.save('test.ann')

# ...

u = AnnoyIndex(f)
u.load('test.ann')  # super fast, will just mmap the file
print(u.get_nns_by_item(0, 1000))  # will find the 1000 nearest neighbors

item = u.get_item_vector(0)
print(u.get_nns_by_vector(item, 1000))  # will find the 1000 nearest neighbors

#print(len(u.get_nns_by_vector(item, 1000)))
#print(len(set(u.get_nns_by_vector(item, 1000))))
#print(len(u.get_nns_by_item(0, 1000)))
#print(len(set(u.get_nns_by_item(0, 1000))))
#if u.get_nns_by_vector(item, 1000) == u.get_nns_by_item(0, 1000):
#    print("SAME\n")
# Demo: index the three unit vectors of a 3-dimensional space and query the
# index by item id and by an arbitrary vector.
from annoy import AnnoyIndex

a = AnnoyIndex(3)
a.add_item(0, [1, 0, 0])
a.add_item(1, [0, 1, 0])
a.add_item(2, [0, 0, 1])
# -1 leaves the number of trees to Annoy.
a.build(-1)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
# More neighbours are requested (100) than items exist (3).
print(a.get_nns_by_item(0, 100))
print(a.get_nns_by_vector([1.0, 0.5, 0.5], 100))