def test_get_lots_of_nns(self):
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    i.add_item(0, [random.gauss(0, 1) for x in xrange(f)])
    i.build(10)
    for j in xrange(100):
        self.assertEqual(i.get_nns_by_item(0, 999999999), [0])
def test_write_failed(self):
    f = 40

    # Build the initial index
    t = AnnoyIndex(f)
    for i in range(1000):
        v = [random.gauss(0, 1) for z in range(f)]
        t.add_item(i, v)
    t.build(10)

    if sys.platform == "linux" or sys.platform == "linux2":  # linux
        try:
            t.save("/dev/full")
            self.fail("didn't get expected exception")
        except Exception as e:
            self.assertTrue(str(e).find("No space left on device") > 0)
    elif sys.platform == "darwin":
        volume = "FULLDISK"
        device = os.popen('hdiutil attach -nomount ram://64').read()
        os.popen('diskutil erasevolume MS-DOS %s %s' % (volume, device))
        os.popen('touch "/Volumes/%s/full"' % volume)
        try:
            t.save('/Volumes/%s/annoy.tree' % volume)
            self.fail("didn't get expected exception")
        except Exception as e:
            self.assertTrue(str(e).find("No space left on device") > 0)
        finally:
            os.popen("hdiutil detach %s" % device)
def test_overwrite_index(self):
    # Issue #335
    f = 40

    # Build the initial index
    t = AnnoyIndex(f)
    for i in range(1000):
        v = [random.gauss(0, 1) for z in range(f)]
        t.add_item(i, v)
    t.build(10)
    t.save('test.ann')

    # Load index file
    t2 = AnnoyIndex(f)
    t2.load('test.ann')

    # Overwrite index file
    t3 = AnnoyIndex(f)
    for i in range(500):
        v = [random.gauss(0, 1) for z in range(f)]
        t3.add_item(i, v)
    t3.build(10)
    if os.name == 'nt':
        # Can't overwrite on Windows
        with self.assertRaises(IOError):
            t3.save('test.ann')
    else:
        t3.save('test.ann')
        # Get nearest neighbors
        v = [random.gauss(0, 1) for z in range(f)]
        nns = t2.get_nns_by_vector(v, 1000)  # Should not crash
def precision(f=40, n=1000000):
    t = AnnoyIndex(f)
    for i in xrange(n):
        v = []
        for z in xrange(f):
            v.append(random.gauss(0, 1))
        t.add_item(i, v)

    t.build(2 * f)
    t.save('test.tree')

    limits = [10, 100, 1000, 10000]
    k = 10
    prec_sum = {}
    prec_n = 1000
    time_sum = {}

    for i in xrange(prec_n):
        j = random.randrange(0, n)
        print 'finding nbs for', j

        closest = set(t.get_nns_by_item(j, n)[:k])
        for limit in limits:
            t0 = time.time()
            toplist = t.get_nns_by_item(j, limit)
            T = time.time() - t0

            found = len(closest.intersection(toplist))
            hitrate = 1.0 * found / k
            prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate
            time_sum[limit] = time_sum.get(limit, 0.0) + T

        for limit in limits:
            print 'limit: %-9d precision: %6.2f%% avg time: %.6fs' % (limit, 100.0 * prec_sum[limit] / (i + 1), time_sum[limit] / (i + 1))
def recall_at(self, n, n_trees=10, n_points=1000, n_rounds=5):
    # the best movie/variable name
    total_recall = 0.

    for r in range(n_rounds):
        # create random points at distance x
        f = 10
        idx = AnnoyIndex(f, 'dot')
        data = numpy.array([
            [random.gauss(0, 1) for z in range(f)]
            for j in range(n_points)
        ])

        expected_results = [
            sorted(
                range(n_points),
                key=lambda j: dot_metric(data[i], data[j])
            )[:n]
            for i in range(n_points)
        ]

        for i, vec in enumerate(data):
            idx.add_item(i, vec)
        idx.build(n_trees)

        for i in range(n_points):
            nns = idx.get_nns_by_vector(data[i], n)
            total_recall += recall(nns, expected_results[i])

    return total_recall / float(n_rounds * n_points)
def test_single_vector(self):
    # https://github.com/spotify/annoy/issues/194
    a = AnnoyIndex(3)
    a.add_item(0, [1, 0, 0])
    a.build(10)
    a.save('1.ann')
    self.assertEqual(a.get_nns_by_vector([1, 0, 0], 3, include_distances=True), ([0], [0.0]))
def make_text_graph(user_lemma_matrix, dimensionality, metric, number_of_estimators, number_of_neighbors):
    user_lemma_matrix_tfidf = augmented_tf_idf(user_lemma_matrix)
    # print(user_lemma_matrix_tfidf.shape)
    if (user_lemma_matrix_tfidf.shape[0] <= dimensionality) or (user_lemma_matrix_tfidf.shape[1] <= dimensionality):
        X_svd = user_lemma_matrix_tfidf.toarray()
    else:
        X_svd = TruncatedSVD(n_components=dimensionality).fit_transform(user_lemma_matrix_tfidf)

    annoy_index = AnnoyIndex(X_svd.shape[1], metric=metric)
    for q in range(X_svd.shape[0]):
        annoy_index.add_item(q, X_svd[q, :])
    annoy_index.build(number_of_estimators)

    row = list()
    col = list()
    data = list()
    for q in range(X_svd.shape[0]):
        neighbors, distances = annoy_index.get_nns_by_item(q, number_of_neighbors, include_distances=True)
        row.extend([q] * number_of_neighbors)
        col.extend(neighbors)
        data.extend(distances)

    row = np.array(row, dtype=np.int64)
    col = np.array(col, dtype=np.int64)
    data = np.array(data, dtype=np.float64)

    text_graph = spsp.coo_matrix((data, (row, col)), shape=(X_svd.shape[0], X_svd.shape[0]))
    text_graph = spsp.csr_matrix(text_graph)

    return text_graph
def test_zero_vectors(self):
    # Mentioned on the annoy-user list
    bitstrings = [
        '0000000000011000001110000011111000101110111110000100000100000000',
        '0000000000011000001110000011111000101110111110000100000100000001',
        '0000000000011000001110000011111000101110111110000100000100000010',
        '0010010100011001001000010001100101011110000000110000011110001100',
        '1001011010000110100101101001111010001110100001101000111000001110',
        '0111100101111001011110010010001100010111000111100001101100011111',
        '0011000010011101000011010010111000101110100101111000011101001011',
        '0011000010011100000011010010111000101110100101111000011101001011',
        '1001100000111010001010000010110000111100100101001001010000000111',
        '0000000000111101010100010001000101101001000000011000001101000000',
        '1000101001010001011100010111001100110011001100110011001111001100',
        '1110011001001111100110010001100100001011000011010010111100100111',
    ]
    vectors = [[int(bit) for bit in bitstring] for bitstring in bitstrings]

    f = 64
    idx = AnnoyIndex(f, 'hamming')
    for i, v in enumerate(vectors):
        idx.add_item(i, v)

    idx.build(10)
    idx.save('idx.ann')
    idx = AnnoyIndex(f, 'hamming')
    idx.load('idx.ann')
    js, ds = idx.get_nns_by_item(0, 5, include_distances=True)
    self.assertEqual(js[0], 0)
    self.assertEqual(ds[:4], [0, 1, 1, 22])
def _test_holes_base(self, n, f=100, base_i=100000):
    annoy = AnnoyIndex(f)
    for i in range(n):
        annoy.add_item(base_i + i, numpy.random.normal(size=(f,)))
    annoy.build(100)
    res = annoy.get_nns_by_item(base_i, n)
    self.assertEqual(set(res), set([base_i + i for i in range(n)]))
def _get_index(self, dataset):
    url = 'http://vectors.erikbern.com/%s.hdf5' % dataset
    vectors_fn = os.path.join('test', dataset + '.hdf5')
    index_fn = os.path.join('test', dataset + '.annoy')

    if not os.path.exists(vectors_fn):
        print('downloading', url, '->', vectors_fn)
        urlretrieve(url, vectors_fn)

    dataset_f = h5py.File(vectors_fn)
    distance = dataset_f.attrs['distance']
    f = dataset_f['train'].shape[1]
    annoy = AnnoyIndex(f, distance)

    if not os.path.exists(index_fn):
        print('adding items', distance, f)
        for i, v in enumerate(dataset_f['train']):
            annoy.add_item(i, v)

        print('building index')
        annoy.build(10)
        annoy.save(index_fn)
    else:
        annoy.load(index_fn)

    return annoy, dataset_f
def build_index(df, n_trees=50, dist_metric='angular', out_dir="./"):
    n_records = df.shape[0]
    n_col = df.shape[1]
    index = AnnoyIndex(n_col, metric=dist_metric)
    patient_dict = {}
    index_dict = {}
    i = 0
    print "Adding items to the index..."
    for patient_id in df.index.values:
        if i % 10000 == 0:
            print str(i)
        vec = df.loc[patient_id].values
        index.add_item(i, vec)
        patient_dict[patient_id] = i
        index_dict[i] = patient_id
        i += 1

    print "Building the index..."
    index.build(n_trees)
    index.save(out_dir + "annoy_index.ann")

    ## Save the patient_id -> index mapping ##
    w = csv.writer(open(out_dir + "patient_mapping.csv", "w"))
    for key, val in patient_dict.items():
        w.writerow([key, val])

    w = csv.writer(open(out_dir + "index_mapping.csv", "w"))
    for key, val in index_dict.items():
        w.writerow([key, val])
def build_annoy_index(corpus, dimension, winlen, winstep):
    print "Adding to Annoy index"
    index = AnnoyIndex(dimension, "euclidean")
    mfcc_list = []
    i = 0
    for filename, frames in corpus:
        # print filename, frames.shape
        for index_in_file, mfcc in enumerate(frames):
            mfcc_list.append((filename, index_in_file))
            index.add_item(i, mfcc.tolist())
            assert mfcc_list[i] == (filename, index_in_file)
            i += 1

    opts = {"samplerate": desired_samplerate,
            "winlen": winlen,
            "winstep": winstep,
            "numcep": 13,
            "nfilt": 26,
            "nfft": 512,
            "ntrees": ANN_NTREES}
    cache_filename = "annoy_index_" + hashlib.md5(str([filename for filename, frames in corpus])).hexdigest() + "." + "_".join("%s=%s" % (k, v) for k, v in sorted(opts.items())) + ".tree"

    if not os.path.exists(cache_filename):
        print "Building Annoy index with %d trees" % ANN_NTREES
        # index.build(-1)
        index.build(ANN_NTREES)
        index.save(cache_filename)
        print "\tWrote cache to %s" % cache_filename
    else:
        print "\tReading cache from %s" % cache_filename
        index.load(cache_filename)
    return index, mfcc_list
def ANN(searchSpace):
    dimension = searchSpace[0].shape[0]
    t = AnnoyIndex(dimension, metric='euclidean')
    for i in range(len(searchSpace)):
        t.add_item(i, searchSpace[i])
    t.build(10)
    return t
def build_tree(df, metric):
    '''
    INPUTS: Pandas DataFrame, Choice of Metric Space String
    OUTPUTS: Returns the built AnnoyIndex tree, returns a dictionary mapping
             index numbers to the DataFrame's index

    Builds an ANN tree using Spotify's ANNoy library. Metric is the metric
    space (either euclidean or angular).
    '''
    tree = AnnoyIndex(len(df.iloc[0, :].values), metric=metric)
    indexes = {}
    for i in xrange(len(df)):
        v = df.iloc[i, :]
        indexes[i] = v.name
        tree.add_item(i, v.values)
    tree.build(50)
    tree.save(DATA_DIR + 'tree_' + metric + '.ann')
    with open(DATA_DIR + 'indexes_' + metric, 'wb') as f:
        pickle.dump(indexes, f)
    return (tree, indexes)
def test_tuple(self, n_points=1000, n_trees=10):
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    for j in xrange(n_points):
        i.add_item(j, (random.gauss(0, 1) for x in xrange(f)))
    i.build(n_trees)
def test_no_items(self):
    idx = AnnoyIndex(100)
    idx.build(n_trees=10)
    idx.save('foo.idx')
    idx = AnnoyIndex(100)
    idx.load('foo.idx')
    self.assertEqual(idx.get_n_items(), 0)
    self.assertEqual(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [])
def fit_annoy(data, n_trees=-1):
    logger.info('Fitting Annoy Matcher...')
    from annoy import AnnoyIndex
    matcher = AnnoyIndex(data.shape[1], metric='euclidean')
    for i, d in enumerate(data):
        matcher.add_item(i, d)
    matcher.build(n_trees)
    return matcher
def build_index(counts, label_to_id, dimension):
    index = AnnoyIndex(dimension, metric='angular')
    for label, cnt_list in counts.items():
        id = label_to_id[label]
        index.add_item(id, cnt_list)
    index.build(100)
    return index
def test_get_nns_by_vector(self):
    f = 2
    i = AnnoyIndex(f, 'euclidean')
    i.add_item(0, [2, 2])
    i.add_item(1, [3, 2])
    i.build(10)
    self.assertEqual(i.get_nns_by_vector([3, 3], 2), [1, 0])
def test_save_without_build(self):
    # Issue #61
    i = AnnoyIndex(10)
    i.add_item(1000, [random.gauss(0, 1) for z in xrange(10)])
    i.save('x.tree')
    j = AnnoyIndex(10)
    j.load('x.tree')
    j.build(10)
def test_wrong_length(self, n_points=1000, n_trees=10):
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    i.add_item(0, [random.gauss(0, 1) for x in xrange(f)])
    self.assertRaises(IndexError, i.add_item, 1, [random.gauss(0, 1) for x in xrange(f + 1000)])
    self.assertRaises(IndexError, i.add_item, 2, [])
    i.build(n_trees)
def _build_from_model(self, vectors, labels, num_features):
    index = AnnoyIndex(num_features)
    for vector_num, vector in enumerate(vectors):
        index.add_item(vector_num, vector)
    index.build(self.num_trees)
    self.index = index
    self.labels = labels
def test_get_nns_by_vector(self):
    f = 3
    i = AnnoyIndex(f)
    i.add_item(0, [1, 0, 0])
    i.add_item(1, [0, 1, 0])
    i.add_item(2, [0, 0, 1])
    i.build(10)
    self.assertEqual(i.get_nns_by_vector([3, 2, 1], 3), [0, 1, 2])
def create_index_tree(clusters):
    features = clusters.shape[1]
    tree = AnnoyIndex(features, metric='euclidean')
    for i, v in enumerate(clusters):
        tree.add_item(i, v.tolist())
    tree.build(features * 2)
    return tree
def test_include_dists_check_ranges(self):
    f = 3
    i = AnnoyIndex(f)
    for j in xrange(100000):
        i.add_item(j, numpy.random.normal(size=f))
    i.build(10)
    indices, dists = i.get_nns_by_item(0, 100000, include_distances=True)
    self.assertTrue(max(dists) < 2.0)
    self.assertAlmostEqual(min(dists), 0.0)
def test_numpy(self, n_points=1000, n_trees=10):
    f = 10
    i = AnnoyIndex(f, 'euclidean')
    for j in xrange(n_points):
        a = numpy.random.normal(size=f)
        a = a.astype(random.choice([numpy.float64, numpy.float32, numpy.uint8, numpy.int16]))
        i.add_item(j, a)
    i.build(n_trees)
def get_rank(uid):
    """Returns a list of the 10 best ranked items for a user

    This function generates a rank of items for a given user by using
    Approximate Nearest Neighbours. The algorithm is imported from the Annoy
    library (developed by Spotify).

    Todo:
        The index is built from scratch every time the function is called,
        which definitely should be changed in the future for increased
        performance. It should be fairly easy to do, as ANNOY can store
        indexes in files which can easily be shared by processes. However,
        it works well with a few hundred items as it is now.

    item_queue:
        A list of item ids for each user. It acts as a circular queue for
        keeping track of which items the user has seen so far. When two new
        items are shown to the user, they are placed in the back of the queue.

    Args:
        uid (int): User ID

    Returns:
        List of item ids (str)
    """
    ann = AnnoyIndex(data_dimension)
    try:
        items = db.items.find()
        q = db.users.find({"uid": uid}, {"item_queue": 1, "_id": 0})[0]["item_queue"]
    except TypeError:
        print "Unable to fetch user from DB"

    ids = [i["vid"] for i in q]

    # Following line can be deleted or modified.
    # It removes the last 15 items from the ANN tree, so they will never be
    # recommended for the user. This is done to make sure the user only sees
    # new items in the recommended list (assuming 15 is the number of
    # comparisons the user has made). This is sort of a hack and can be
    # removed/modified later on if necessary.
    ids[-15:] = []
    print ids

    id_dict = {}
    # Add items to ANN tree
    for i, item in enumerate(items):
        if item["vid"] in ids:
            # Store all ids in a dictionary
            id_dict[str(i)] = item["vid"]
            ann.add_item(i, item["vals"])

    # Erik Bernhardsson (author of ANNOY) suggests using 2 * the data
    # dimension as the number of trees to build.
    ann.build(data_dimension * 2)

    try:
        user = db.users.find({"uid": uid})[0]
    except TypeError:
        print "Unable to fetch user from DB"

    # Get 10 highest ranked items for that user
    nns_tmp = ann.get_nns_by_vector(user["vals"], 10)
    nns = [id_dict[str(k)] for k in nns_tmp]
    print nns
    return nns
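# A minimal sketch (not part of the original snippet) of the persistence approach the
# docstring's Todo describes: build the Annoy index once, save it to disk, and load
# (mmap) it in each process instead of rebuilding it on every get_rank() call.
# The file name 'item_index.ann' and the helper names build_item_index/load_item_index
# are hypothetical; data_dimension and db are assumed to be the same objects used above.
from annoy import AnnoyIndex


def build_item_index(index_path='item_index.ann'):
    ann = AnnoyIndex(data_dimension)
    id_dict = {}
    for i, item in enumerate(db.items.find()):
        id_dict[str(i)] = item["vid"]
        ann.add_item(i, item["vals"])
    ann.build(data_dimension * 2)
    ann.save(index_path)  # persist the built trees to disk
    return id_dict


def load_item_index(index_path='item_index.ann'):
    ann = AnnoyIndex(data_dimension)
    ann.load(index_path)  # mmaps the file, so it is cheap to share across processes
    return ann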
def test_get_nns_by_item(self):
    f = 2
    i = AnnoyIndex(f, 'euclidean')
    i.add_item(0, [2, 2])
    i.add_item(1, [3, 2])
    i.add_item(2, [3, 3])
    i.build(10)
    self.assertEqual(i.get_nns_by_item(0, 3), [0, 1, 2])
    self.assertEqual(i.get_nns_by_item(2, 3), [2, 1, 0])
def test_get_nns_by_item(self):
    f = 3
    i = AnnoyIndex(f)
    i.add_item(0, [2, 1, 0])
    i.add_item(1, [1, 2, 0])
    i.add_item(2, [0, 0, 1])
    i.build(10)
    self.assertEqual(i.get_nns_by_item(0, 3), [0, 1, 2])
    self.assertEqual(i.get_nns_by_item(1, 3), [1, 0, 2])
def test_only_one_item(self):
    # reported to annoy-user by Kireet Reddy
    idx = AnnoyIndex(100)
    idx.add_item(0, numpy.random.randn(100))
    idx.build(n_trees=10)
    idx.save('foo.idx')
    idx = AnnoyIndex(100)
    idx.load('foo.idx')
    self.assertEqual(idx.get_n_items(), 1)
    self.assertEqual(idx.get_nns_by_vector(vector=numpy.random.randn(100), n=50, include_distances=False), [0])
class LRU_KNN:
    def __init__(self, capacity, key_dim, value_dim, batch_size):
        self.capacity = capacity
        self.curr_capacity = 0
        self.states = np.zeros((capacity, key_dim))
        self.values = np.zeros((capacity, value_dim))
        self.lru = np.zeros(capacity)
        self.tm = 0.0
        self.index = AnnoyIndex(key_dim, metric="euclidean")
        self.index.set_seed(123)

        self.initial_update_size = batch_size
        self.min_update_size = self.initial_update_size
        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

    def nn(self, keys, k):
        dists = []
        inds = []
        for key in keys:
            ind, dist = self.index.get_nns_by_vector(key, k, include_distances=True)
            dists.append(dist)
            inds.append(ind)
        return dists, inds

    def query(self, keys, k):
        _, indices = self.nn(keys, k)
        states = []
        values = []
        for ind in indices:
            self.lru[ind] = self.tm
            states.append(self.states[ind])
            values.append(self.values[ind])
        self.tm += 0.001
        return states, values

    def _insert(self, keys, values, indices):
        self.cached_states = self.cached_states + keys
        self.cached_values = self.cached_values + values
        self.cached_indices = self.cached_indices + indices

        if len(self.cached_states) >= self.min_update_size:
            self.min_update_size = max(self.initial_update_size, self.curr_capacity * 0.02)
            self._update_index()

    def _update_index(self):
        self.index.unbuild()
        for i, ind in enumerate(self.cached_indices):
            new_state = self.cached_states[i]
            new_value = self.cached_values[i]

            self.states[ind] = new_state
            self.values[ind] = new_value
            self.index.add_item(ind, new_state)

        self.cached_states = []
        self.cached_values = []
        self.cached_indices = []

        self.index.build(50)
        self.built_capacity = self.curr_capacity

    def _rebuild_index(self):
        self.index.unbuild()
        for ind, state in enumerate(self.states[:self.curr_capacity]):
            self.index.add_item(ind, state)

        self.index.build(50)
        self.built_capacity = self.curr_capacity
k_strided = [fake.k[i] for i in range(0, len(fake.k), opt.stride)]

for window_left in k_strided:
    window_right = window_left + opt.width

    if window_right > np.max(fake.k):
        break

    selection = ((fake.k >= window_left) & (fake.k <= window_right))
    window_k = fake.k[selection]
    window_band_l = fake.E_lower[selection]
    window_band_u = fake.E_upper[selection]

    window_size = np.max(window_k) - np.min(window_k)
    gap = np.min(window_band_u) - np.max(window_band_l)

    window_band_l = interpolate_normalize(window_k, window_band_l, opt.dimensions)
    window_band_u = interpolate_normalize(window_k, window_band_u, opt.dimensions)

    # plt.plot(window_band_l)
    # plt.plot(window_band_u)
    # plt.title('k =' + str(window_left) + ' gap = ' + str(gap))
    # plt.show()

    annoyindex.add_item(len(lookuptable), np.concatenate([window_band_l, window_band_u]))
    lookuptable.append([window_left, gap])

annoyindex.build(opt.trees)
annoyindex.save('index_test.ann')
np.save('lookuptable_test', lookuptable)
parser.add_argument('--file', help='Input file')
parser.add_argument('--out', help='Outfile base')
parser.add_argument('--L', help='Fingerprint length')
parser.add_argument('--norm', help='Normalize')
args = parser.parse_args()

a = AnnoyIndex(int(args.L))
i = 0
names = []
with gzip.open(args.file, 'rt') as f:
    for line in f:
        id, statements, *v = line.split("\t")
        id = re.sub('.json.gz', '', id)
        id = re.sub('\.', '|', id)
        names.append(id)
        v = [float(j) for j in v]
        if args.norm:
            avg = statistics.mean(v)
            std = statistics.stdev(v)
            v = [(j - avg) / std for j in v]
        a.add_item(i, v)
        i = i + 1

a.build(-1)
a.save(args.out + '.tree')

with open(args.out + '.names', 'w') as f:
    for item in names:
        f.write("%s\n" % item)
def hard_mining_reset(self): #import faiss from annoy import AnnoyIndex data = nd.zeros(self.provide_data[0][1]) label = nd.zeros(self.provide_label[0][1]) #label = np.zeros( self.provide_label[0][1] ) X = None ba = 0 batch_num = 0 while ba < len(self.oseq): batch_num += 1 if batch_num % 10 == 0: print('loading batch', batch_num, ba) bb = min(ba + self.batch_size, len(self.oseq)) _count = bb - ba for i in range(_count): idx = self.oseq[i + ba] s = self.imgrec.read_idx(idx) header, img = recordio.unpack(s) img = self.imdecode(img) data[i][:] = self.postprocess_data(img) label[i][:] = header.label db = mx.io.DataBatch(data=(data, self.data_extra), label=(label, )) self.mx_model.forward(db, is_train=False) net_out = self.mx_model.get_outputs() embedding = net_out[0].asnumpy() nembedding = sklearn.preprocessing.normalize(embedding) if _count < self.batch_size: nembedding = nembedding[0:_count, :] if X is None: X = np.zeros((len(self.id2range), nembedding.shape[1]), dtype=np.float32) nplabel = label.asnumpy() for i in range(_count): ilabel = int(nplabel[i]) #print(ilabel, ilabel.__class__) X[ilabel] += nembedding[i] ba = bb X = sklearn.preprocessing.normalize(X) d = X.shape[1] t = AnnoyIndex(d, metric='euclidean') for i in range(X.shape[0]): t.add_item(i, X[i]) print('start to build index') t.build(20) print(X.shape) k = self.per_identities self.seq = [] for i in range(X.shape[0]): nnlist = t.get_nns_by_item(i, k) assert nnlist[0] == i for _label in nnlist: assert _label < len(self.id2range) _id = self.header0[0] + _label v = self.id2range[_id] _list = range(*v) if len(_list) < self.images_per_identity: random.shuffle(_list) else: _list = np.random.choice(_list, self.images_per_identity, replace=False) for i in range(self.images_per_identity): _idx = _list[i % len(_list)] self.seq.append(_idx)
log.debug(f'{df_click.head()}')

article_vec_map = word2vec(df_click, 'user_id', 'click_article_id', model_path)
f = open(w2v_file, 'wb')
pickle.dump(article_vec_map, f)
f.close()

# Build an Annoy index over the article embeddings
article_index = AnnoyIndex(256, 'angular')
article_index.set_seed(2020)

for article_id, emb in tqdm(article_vec_map.items()):
    article_index.add_item(article_id, emb)

article_index.build(100)

user_item_ = df_click.groupby('user_id')['click_article_id'].agg(
    lambda x: list(x)).reset_index()
user_item_dict = dict(
    zip(user_item_['user_id'], user_item_['click_article_id']))

# Recall
n_split = max_threads
all_users = df_query['user_id'].unique()
shuffle(all_users)
total = len(all_users)
n_len = total // n_split

# Clear the temporary folder
for path, _, file_list in os.walk('../tmp/w2v'):
class PerCategoryTable: def __init__(self, db): self.db = db self.cfg = db.cfg self.cache_dir = db.cache_dir # def retrieve(self, query_vector, K=1): # if getattr(self, 'nntable') is None: # print('The NNTable has not been built, please run build_nntable first.') # return None # inds = self.nntable.get_nns_by_vector(query_vector, K, search_k=-1, include_distances=False) # inds = list(inds) # if len(inds) > 1: # patches = [] # for i in range(len(inds)): # patches.append(self.patchdb[inds[i]]) # return patches # else: # return self.patchdb[inds[0]] def retrieve(self, query_vector, K=1): # if getattr(self, 'nntable') is None: # print('The NNTable has not been built, please run build_nntable first.') # return None N = 10 inds = self.nntable.get_nns_by_vector(query_vector, N, search_k=-1, include_distances=False) inds = list(inds) tmp = np.random.permutation(range(N)) return self.patchdb[inds[tmp[0]]] def build_nntable(self, category_id, patchdb, use_cache=True): # keep a reference to the per-category patchdb self.patchdb = patchdb # cache output directories if self.cfg.use_patch_background: nntable_folder_name = self.db.split + '_nntables_with_bg' else: nntable_folder_name = self.db.split + '_nntables_without_bg' nntable_dir = osp.join(self.cache_dir, nntable_folder_name) maybe_create(nntable_dir) nntable_file = osp.join(nntable_dir, '%03d_nntable.ann'%category_id) # load or create the files if osp.exists(nntable_file) and use_cache: ################################################################# ## Load the files if possible ################################################################# self.nntable = AnnoyIndex(self.cfg.n_patch_features) self.nntable.load(nntable_file) else: ################################################################# ## create the cache files ################################################################# category = self.db.classes[category_id] print("%s NNTable"%category) t0 = time() self.nntable = AnnoyIndex(self.cfg.n_patch_features) for i in range(len(patchdb)): x = patchdb[i] image_index = x['image_index'] instance_ind = x['instance_ind'] feature_path = self.db.patch_path_from_indices(image_index, instance_ind, 'patch_feature', 'pkl', self.cfg.use_patch_background) with open(feature_path, 'rb') as fid: features = pickle.load(fid) self.nntable.add_item(i, features) n_trees = max(len(patchdb)//100, self.cfg.n_nntable_trees) self.nntable.build(n_trees) print("%s NNTable completes (time %.2fs)" % (category, time() - t0)) ##################################################################### ## Save cache files for faster loading in the future ##################################################################### self.nntable.save(nntable_file) print('wrote nntable to {}'.format(nntable_file))
for folder in os.listdir(base_url):
    celeb_encoding = {}
    celeb_mapping[folder] = []
    for image in tqdm(listdir(base_url + '/' + folder)):
        try:
            encoding = get_encoding(os.path.join(base_url, folder, image))
        except Exception as e:
            print(e)
            continue
        if encoding is not None:
            c += 1
            celeb_encoding[c] = encoding[0]
            celeb_mapping[folder].append(c)
            ann_index.add_item(c, encoding[0])
    save_json(celeb_mapping)
    pickle.dump(celeb_encoding, open(f"celeb_encodings/{folder}_encoding.pkl", "wb"))
    del celeb_encoding

save_json(celeb_mapping)
print("Encoding and mapping files saved successfully")

print("Building ann index...")
ann_index.build(1000)
x = ann_index.save("celeb_index.ann")
if x:
    print("Ann index saved successfully")
else:
    print("Error in saving ann index")
def generate_negative_training_examples(self, df_positives_cases, df_wiki_1_vectors, df_wiki_2_vectors): # initialize data frame df_negative_examples = pd.DataFrame(columns=['entity_id_wiki_1', 'entity_id_wiki_2', 'vector_entity_1', 'vector_entity_2','label']) dict_wiki_1 = df_wiki_1_vectors['entity_id'].to_dict() index_map_wiki_1 = dict((v,k) for k,v in dict_wiki_1.items()) dict_wiki_2 = df_wiki_2_vectors['entity_id'].to_dict() index_map_wiki_2 = dict((v,k) for k,v in dict_wiki_2.items()) t_wiki_1 = None # Length of item vector that will be indexed t_wiki_2 = None if len(df_wiki_1_vectors) > 0: print('building index for entities in wiki 1') t_wiki_1 = AnnoyIndex(EMBEDDING_VECTOR_LENGTH, 'angular') for index, row in df_wiki_1_vectors.iterrows(): v = row['vector'] t_wiki_1.add_item(index, v) t_wiki_1.build(20) # 10 trees print('building index for entities in wiki 1 done') if len(df_wiki_2_vectors) > 0: print('building index for entities in wiki 2') t_wiki_2 = AnnoyIndex(EMBEDDING_VECTOR_LENGTH, 'angular') for index, row in df_wiki_2_vectors.iterrows(): v = row['vector'] t_wiki_2.add_item(index, v) t_wiki_2.build(20) # 10 trees print('building index for entities in wiki 2 done') print('total positive cases:', len(df_positives_cases)) df_positives_cases_slice = df_positives_cases.head(1000) for index, row in df_positives_cases_slice.iterrows(): #print(df_wiki_1_vectors.head()) #print(row['entity_id_wiki_1']) entity_1 = row['entity_id_wiki_1'] entity_2 = row['entity_id_wiki_2'] if entity_1 in index_map_wiki_1: filtered_entity_1 = df_wiki_1_vectors.loc[index_map_wiki_1[entity_1],:] #print(filtered_entity_1) if len(filtered_entity_1) > 0: vector = filtered_entity_1['vector'] nn_ent1 = t_wiki_2.get_nns_by_vector(vector, 5, include_distances=False) for i in range(0,len(nn_ent1)): df_negative_examples.loc[len(df_negative_examples)] = [entity_1] + [dict_wiki_2[i]] + [vector] + [t_wiki_2.get_item_vector(i)] +[0] #df_negative_examples = df_negative_examples.append({'entity_id_wiki_1': entity_1, 'entity_id_wiki_2': dict_wiki_2[i], 'vector_entity_1': vector ,'vector_entity_2': t_wiki_2.get_item_vector(i), 'label':0}, ignore_index=True) if entity_2 in index_map_wiki_2: filtered_entity_2 = df_wiki_2_vectors.loc[index_map_wiki_2[entity_2],:] if len(filtered_entity_2) > 0: vector = filtered_entity_2['vector'] nn_ent2 = t_wiki_1.get_nns_by_vector(vector, 5, include_distances=False) for i in range(0,len(nn_ent2)): df_negative_examples = df_negative_examples.append({'entity_id_wiki_1': dict_wiki_1[i], 'entity_id_wiki_2': entity_2, 'vector_entity_1': t_wiki_1.get_item_vector(i) ,'vector_entity_2': vector, 'label':0}, ignore_index=True) return df_negative_examples
class DatasetCollector: """Класс для создания тренировочных и проверочных данных""" DATASET_R_KEY_EX = 60 * 60 * 24 * 5 DATASET_R_KEY = 'dataset:{}' DATASET_ALL_R_KEY = 'dataset:{}:all' DATASET_START_R_KEY = 'dataset:{}:start' def __init__(self, dataset_model: Dataset): self.dataset_model = dataset_model self.dataset_dir = dataset_model.path self.annoy_index = None def create_doctor_item_base_matrix(self, save: bool = True ) -> Tuple[pd.DataFrame, AnnoyIndex]: """Создание item base матрицы врачей и сохранение в индексе annoy и csv""" data = pd.DataFrame.from_records( Doctor.query.order_by(Doctor.id).all()) ids = data.iloc[:, 0] # Производим нормализацию (MinMaxScaler переносит все точки на отрезок (0, 1)) features = pd.DataFrame.from_records(MinMaxScaler().fit_transform( data.iloc[:, 3:])) matrix_data = pd.concat([ids, features], axis=1) self.annoy_index = AnnoyIndex(AnnoySettings.ITEMS, AnnoySettings.METRIC) for doc_id, doc_feature in zip(ids.values, features.values): self.annoy_index.add_item(doc_id, doc_feature) self.annoy_index.build(AnnoySettings.TREES, AnnoySettings.JOBS) if save: matrix_data.to_csv(self.get_save_path(DOCTORS_CSV), header=False, index=False) self.annoy_index.save(self.get_save_path(DOCTORS_ANN)) return matrix_data, self.annoy_index def get_save_path(self, file_name: str) -> str: return os.path.join(self.dataset_dir, file_name) @staticmethod def get_appts_by_user(user_id: int) -> List[Appointment]: """Получает список записей на прием""" appts = (Appointment.query.options( load_only( 'id', 'doctor_id', 'spec_id')).filter(Appointment.user_id == user_id).order_by( desc(Appointment.dt_created)).distinct().all()) return [appt for appt in appts] @staticmethod def get_users(min_appt=1) -> List[int]: """Получает список пользователей, у которых записей на прием не меньше, чем min_appt""" users = (Appointment.query.with_entities(Appointment.user_id).group_by( Appointment.user_id).having( func.count(Appointment.doctor_id) >= min_appt)) return [user[0] for user in users.all()] @staticmethod def get_doctor_towns() -> Dict[int, int]: """Получаем докторов и их города""" doctors = Doctor.query.with_entities(Doctor.id, Doctor.town_id).all() return {doctor_id: town_id for doctor_id, town_id in doctors} @staticmethod def get_town_doctor_list(town_id: int, spec_id: int, exclude: Tuple[int]) -> List[int]: """Получает список врачей в городе по заданной специальности исключая exclude""" doctors = (DoctorTown.query.with_entities( DoctorTown.doctor_id).filter(DoctorTown.town_id == town_id).filter( DoctorTown.wp_spec_id == spec_id).filter( ~DoctorTown.doctor_id.in_(exclude)).order_by( desc(DoctorTown.rating)).distinct()) return [doc[0] for doc in doctors.all()] def set_appt_dataset(self, to_list, doc_towns, appt) -> None: doctors = self.get_town_doctor_list(doc_towns[appt.doctor_id], appt.spec_id, exclude=(appt.doctor_id, )) for doctor in doctors[:100]: to_list.append( [0, appt.id, *self.annoy_index.get_item_vector(doctor)]) to_list.append( [1, appt.id, *self.annoy_index.get_item_vector(appt.doctor_id)]) def get_check_data(self, doc_towns, last_appt, old_appts) -> dict: """Предагрегирует данные для финального тестирования модели""" doctors = self.get_town_doctor_list(doc_towns[last_appt.doctor_id], last_appt.spec_id, exclude=tuple())[:200] return { 'selected_doctor': last_appt.doctor_id, 'suggested_doctors': doctors, 'all_appts': [appt.doctor_id for appt in old_appts], } def create_datasets_for_catboost( self, min_appts: int = 1, save: bool = True ) -> Tuple[pd.DataFrame, pd.DataFrame, 
List[dict]]: """Создает датасет для тренировки и тестирования""" assert self.annoy_index is not None, 'annoy_index does not exist' all_users = self.get_users(min_appts) doc_towns = self.get_doctor_towns() test, train, check = [], [], [] r_con = redis_connection() dataset_r_key = self.DATASET_R_KEY.format(self.dataset_model.id) r_con.set(dataset_r_key, 0, ex=self.DATASET_R_KEY_EX) r_con.set(self.DATASET_ALL_R_KEY.format(self.dataset_model.id), len(all_users), ex=self.DATASET_R_KEY_EX) r_con.set(self.DATASET_START_R_KEY.format(self.dataset_model.id), time.time(), ex=self.DATASET_R_KEY_EX) for user in all_users: last_appt, *old_appts = self.get_appts_by_user(user) # последнюю запись на прием оставляем для финального тестирования check.append(self.get_check_data(doc_towns, last_appt, old_appts)) r_con.incr(dataset_r_key) if not old_appts: continue # была всего одна запись на прием test_appt, *train_user_appts = old_appts # предпоследнюю запись на прием оставляем для тестирования при обучении self.set_appt_dataset(test, doc_towns, test_appt) # старые записи на прием оставляем для обучения for appt in train_user_appts: self.set_appt_dataset(train, doc_towns, appt) test_df = pd.DataFrame(test) train_df = pd.DataFrame(train) if save: test_df.to_csv(self.get_save_path(TEST_DATASET), header=False, index=False) train_df.to_csv(self.get_save_path(TRAIN_DATASET), header=False, index=False) with open(self.get_save_path(CHECK_DATASET), 'w') as fp: json.dump(check, fp) return test_df, train_df, check def load_dataset(self): test_df = pd.read_csv(self.get_save_path(TEST_DATASET), header=None) train_df = pd.read_csv(self.get_save_path(TRAIN_DATASET), header=None) return test_df, train_df def load_check_dataset(self): with open(self.get_save_path(CHECK_DATASET), 'r') as fp: return json.load(fp) def load_annoy_index(self): if self.annoy_index is None: self.annoy_index = AnnoyIndex(AnnoySettings.ITEMS, AnnoySettings.METRIC) self.annoy_index.load(self.get_save_path(DOCTORS_ANN)) return self.annoy_index
print("Load pre-computed embeddings from disc") with open(embedding_cache_path, "rb") as fIn: cache_data = pickle.load(fIn) corpus_sentences = cache_data['sentences'] corpus_embeddings = cache_data['embeddings'] if not os.path.exists(annoy_index_path): # Create Annoy Index print("Create Annoy index with {} trees. This can take some time.". format(n_trees)) annoy_index = AnnoyIndex(embedding_size, 'angular') for i in range(len(corpus_embeddings)): annoy_index.add_item(i, corpus_embeddings[i]) annoy_index.build(n_trees) annoy_index.save(annoy_index_path) else: #Load Annoy Index from disc annoy_index = AnnoyIndex(embedding_size, 'angular') annoy_index.load(annoy_index_path) corpus_embeddings = torch.from_numpy(corpus_embeddings) ######### Search in the index ########### print("Corpus loaded with {} sentences / embeddings".format( len(corpus_sentences))) while True: inp_question = input("Please enter a question: ")
from annoy import AnnoyIndex
import numpy as np
import time

start = time.time()

data = np.random.randn(1000, 100).astype(np.float32)
index = AnnoyIndex(100, metric="euclidean")
for ind, data_ in enumerate(data):
    index.add_item(ind, data_)
index.build(50)
print(index.get_n_items())

data_1 = np.random.randn(1000, 100).astype(np.float32)
for ind, data_ in enumerate(data_1):
    index.add_item(ind + 1000, data_)
print(index.get_n_items())
index.build(50)
print(index.get_n_items())

end = time.time()

for ind, data_ in enumerate(data_1):
    index.add_item(ind + 1000, data_)
print(index.get_n_items())
index.build(50)
print(index.get_n_items())

print(end - start)
import spacy
from utils import load_jsonl
import numpy as np
import annoy
# import faiss
from annoy import AnnoyIndex
import random

if False:
    f = 7
    t = AnnoyIndex(f)  # Length of item vector that will be indexed
    for i in range(1000):
        v = [random.gauss(0, 1) for z in range(f)]
        t.add_item(i, v)

    t.build(10)  # 10 trees
    t.save('test.ann')

    # ...

    u = AnnoyIndex(f)
    u.load('test.ann')  # super fast, will just mmap the file
    print(u.get_nns_by_item(0, 1000))  # will find the 1000 nearest neighbors
    assert False

nlp = spacy.load('predict/my_model')

texts = [
    'Today is sunny',
    'I hate bunnies',
# config
dims = 2048
n_nearest_neighbors = 3
trees = 10000
infiles = glob.glob('image_vectors/*.npz')

# build ann index
t = AnnoyIndex(dims)
for file_index, i in enumerate(infiles):
    file_vector = np.loadtxt(i)
    file_name = os.path.basename(i).split('.')[0]
    file_index_to_file_name[file_index] = file_name
    file_index_to_file_vector[file_index] = file_vector
    t.add_item(file_index, file_vector)
t.build(trees)
t.save('tree.ann')

'''
# create a nearest neighbors json file for each input
if not os.path.exists('nearest_neighbors'):
    os.makedirs('nearest_neighbors')

for i in file_index_to_file_name.keys():
    master_file_name = file_index_to_file_name[i]
    master_vector = file_index_to_file_vector[i]

    named_nearest_neighbors = []
    nearest_neighbors = t.get_nns_by_item(i, n_nearest_neighbors)
    for j in nearest_neighbors:
        neighbor_file_name = file_index_to_file_name[j]
    ]
    metadata_array, embeddings_array = list(
        zip(*map(load_vectors_and_metadata, file_names)))
    return list(itertools.chain.from_iterable(metadata_array)), np.concatenate(
        embeddings_array, axis=0)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--paths", nargs='+', help="folder of parsed pictures")
    args = parser.parse_args()

    metadata_array = list()
    embeddings_array = list()
    for doc_path in args.paths:
        meta_, vec_ = prepare_data(doc_path)
        metadata_array.append(meta_)
        embeddings_array.append(vec_)

    meta, vec = list(
        itertools.chain.from_iterable(metadata_array)), np.concatenate(
            embeddings_array, axis=0)

    f = vec.shape[1]
    t = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed
    for i in range(vec.shape[0]):
        t.add_item(i, vec[i])

    t.build(30)  # 30 trees
    t.save('test.ann')
    json.dump(meta, open("metadata.json", "w"))
def generate_hashes(images, filename_single_edges, filename_corners, filename_opposite_edges, filename_three_edges, filename_four_edges): num_images = images.shape[0] #tile_index = AnnoyIndex(4 * images.shape[1], metric='euclidean') channels = 1 if len(images.shape) > 3: channels = images.shape[3] corner_index = AnnoyIndex( 2 * channels * images.shape[1], metric='euclidean') # Length of item vector that will be indexed full_filename_corners = os.path.join(os.getcwd(), filename_corners.replace('/', '\\')) single_edge_index = AnnoyIndex( channels * images.shape[1], metric='euclidean') # Length of item vector that will be indexed full_filename_edges = os.path.join( os.getcwd(), filename_single_edges.replace('/', '\\')) opposite_edges_index = AnnoyIndex( 2 * channels * images.shape[1], metric='euclidean') # Length of item vector that will be indexed full_filename_opposite_edges = os.path.join( os.getcwd(), filename_opposite_edges.replace('/', '\\')) three_edges_index = AnnoyIndex( 3 * channels * images.shape[1], metric='euclidean') # Length of item vector that will be indexed full_filename_three_edges = os.path.join( os.getcwd(), filename_three_edges.replace('/', '\\')) four_edges_index = AnnoyIndex( 4 * channels * images.shape[1], metric='euclidean') # Length of item vector that will be indexed full_filename_four_edges = os.path.join( os.getcwd(), filename_four_edges.replace('/', '\\')) identifiers = np.column_stack( (np.floor(np.arange(0, num_images, 0.25)), np.tile(range(4), (1, num_images))[0])) generate_single_edges = not os.path.isfile(full_filename_edges) generate_corners = not os.path.isfile(full_filename_corners) generate_opposite_edges = not os.path.isfile(full_filename_opposite_edges) generate_three_edges = not os.path.isfile(full_filename_three_edges) generate_four_edges = not os.path.isfile(full_filename_four_edges) if not generate_single_edges: single_edge_index.load(full_filename_edges) print('loaded single edge index') if not generate_corners: # and os.path.isfile(full_filename_tiles): corner_index.load(full_filename_corners) print('loaded corner index') if not generate_opposite_edges: opposite_edges_index.load(full_filename_opposite_edges) print('loaded opposite edge index') if not generate_three_edges: three_edges_index.load(full_filename_three_edges) print('loaded three edge index') if not generate_four_edges: four_edges_index.load(full_filename_four_edges) print('loaded four edge index') if not generate_corners and not generate_single_edges and not generate_opposite_edges and not generate_three_edges and not generate_four_edges: print('found all indices, returning...') return single_edge_index, corner_index, opposite_edges_index, three_edges_index, four_edges_index, identifiers #if all are already loaded from file, no generation needed - return from here ct = 0 for idx, image in enumerate(tqdm(images)): (top, right, bottom, left) = get_all_edges_from_array(image) if generate_single_edges: single_edge_index.add_item(ct, top) single_edge_index.add_item(ct + 1, right) single_edge_index.add_item(ct + 2, bottom) single_edge_index.add_item(ct + 3, left) if generate_corners: corner_left_top = np.concatenate([left, top]) corner_top_right = np.concatenate([top, right]) corner_right_bottom = np.concatenate([right, bottom]) corner_bottom_left = np.concatenate([bottom, left]) corner_index.add_item(ct, corner_left_top) corner_index.add_item(ct + 1, corner_top_right) corner_index.add_item(ct + 2, corner_right_bottom) corner_index.add_item(ct + 3, corner_bottom_left) if 
generate_opposite_edges: opposite_left_right = np.concatenate([left, right]) opposite_top_bottom = np.concatenate([top, bottom]) opposite_right_left = np.concatenate([right, left]) opposite_bottom_top = np.concatenate([bottom, top]) opposite_edges_index.add_item(ct, opposite_left_right) opposite_edges_index.add_item(ct + 1, opposite_top_bottom) opposite_edges_index.add_item(ct + 2, opposite_right_left) opposite_edges_index.add_item(ct + 3, opposite_bottom_top) if generate_three_edges: three_without_top = np.concatenate([right, bottom, left]) three_without_right = np.concatenate([bottom, left, top]) three_without_bottom = np.concatenate([left, top, right]) three_without_left = np.concatenate([top, right, bottom]) three_edges_index.add_item(ct, three_without_top) three_edges_index.add_item(ct + 1, three_without_right) three_edges_index.add_item(ct + 2, three_without_bottom) three_edges_index.add_item(ct + 3, three_without_left) if generate_four_edges: tile_edge_top = np.concatenate([top, right, bottom, left]) tile_edge_right = np.concatenate([right, bottom, left, top]) tile_edge_bottom = np.concatenate([bottom, left, top, right]) tile_edge_left = np.concatenate([left, top, right, bottom]) four_edges_index.add_item(ct, tile_edge_top) four_edges_index.add_item(ct + 1, tile_edge_right) four_edges_index.add_item(ct + 2, tile_edge_bottom) four_edges_index.add_item(ct + 3, tile_edge_left) ct += 4 if generate_single_edges: single_edge_index.build(10) # 10 trees single_edge_index.save(filename_single_edges) print('generated and saved single edges index') if generate_corners: corner_index.build(10) # 10 trees corner_index.save(filename_corners) print('generated and saved corner index') if generate_opposite_edges: opposite_edges_index.build(10) # 10 trees opposite_edges_index.save(filename_opposite_edges) print('generated and saved opposite edges index') if generate_three_edges: three_edges_index.build(10) # 10 trees three_edges_index.save(filename_three_edges) print('generated and saved three edges index') if generate_four_edges: four_edges_index.build(10) # 10 trees four_edges_index.save(filename_four_edges) print('generated and saved four edges index') return single_edge_index, corner_index, opposite_edges_index, three_edges_index, four_edges_index, identifiers
class Indexer:
    def __init__(self, dim, repository='', metric='angular', index_name='index.ann', db_name='names.bin', ntrees=500):
        self.metric = metric  # angular or euclidean
        self.dim = dim  # dimension of the indexed feature vectors
        self.repository = repository
        self.index_name = self.repository + '/' + index_name
        self.db_name = self.repository + '/' + db_name
        self.ntrees = ntrees
        self.t = AnnoyIndex(dim, metric)
        self.s = shelve.open(self.db_name)
        self.s['dim'] = self.dim
        self.s['metric'] = self.metric
        self.sm = {}  # in memory

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # to be used with the 'with Indexer(...) as indexer:' statement
        if len(self.sm) > 0:
            for k, v in self.sm.iteritems():
                self.s[k] = v
        self.s.close()

    def index_single(self, c, feature_vector, uri):
        self.t.add_item(c, feature_vector)
        self.s[str(c)] = uri

    def index(self, feature_vectors, uris):
        c = 0
        for f in feature_vectors:
            self.t.add_item(c, f)
            self.s[str(c)] = uris[c]  # uris contains the image uri and possibly more complicated structures
            c = c + 1
        self.build_index()
        self.save_index()

    def index_tags_single(self, tags, uri):
        for t in tags:
            # print t['cat'], t['prob']
            cat_key = str(t['cat'])
            if not cat_key in self.sm:
                self.sm[cat_key] = [{'uri': uri, 'prob': t['prob']}]
            else:
                temp = self.sm[cat_key]
                temp.append({'uri': uri, 'prob': t['prob']})
                self.sm[cat_key] = temp

    def build_index(self):
        logger.info('building index in ' + self.repository)
        self.t.build(self.ntrees)

    def save_index(self):
        logger.info('saving index into ' + self.index_name)
        self.t.save(self.index_name)
class JerkAgent(threading.Thread): def __init__(self, env, solutions=[]): threading.Thread.__init__(self) self.env = TrackedEnv(env) self.solutions = solutions #self.history = pickle.load(open('./history.pkl', 'rb')) self.history = [] self.annoy_index = None #AnnoyIndex(512) #self.annoy_index.load('./test.ann') self.recorded_episode_count = 0 #self.replay_buffer = PrioritizedReplayBuffer(1000000, 0.5, 0.4, epsilon=0.1) def run(self): self.train() def should_use_history(self, reward, env): reward_percentage = reward / MAX_SCORE the_end_is_nigh = (EXPLOIT_BIAS + env.total_steps_ever / TOTAL_TIMESTEPS)**3 return random.random() < np.mean([reward_percentage, the_end_is_nigh]) def best_solution(self): best_pair = sorted(self.solutions, key=lambda x: np.mean(x[0]))[-1] reward = np.mean(best_pair[0]) return best_pair, reward def train(self): """Run JERK on the attached environment.""" new_ep = True best_reward = 0. keep_percentage = 0.6 best_pair = None config = tf.ConfigProto() config.gpu_options.allow_growth = True # pylint: disable=E1101 with tf.Session(config=config): self.session = tf.get_default_session() self.model = policies.CnnPolicy(self.session, self.env.observation_space, self.env.action_space, 1, 1) self.a0 = self.model.pd.sample() params = tf.trainable_variables() #print('params', params) #for i in tf.get_default_graph().get_operations(): # print(i.name) #self.output_layer = tf.get_default_graph().get_tensor_by_name('model/fc1/add:0') self.output_layer = tf.get_default_graph().get_tensor_by_name( 'model/Relu_3:0') #load_path = '/root/compo/saved_weights.joblib' load_path = './saved_weights.joblib' loaded_params = joblib.load(load_path) restores = [] for p, loaded_p in zip(params, loaded_params): restores.append(p.assign(loaded_p)) self.session.run(restores) print('model created') while True: if new_ep: if len(self.solutions) > 0: best_pair, best_reward = self.best_solution() if self.solutions and self.should_use_history( best_reward, self.env): new_rew, last_reward_index = self.exploit( self.env, best_pair[1]) best_pair[0].append(new_rew) if (new_rew / best_reward > keep_percentage) and len( self.env.best_sequence()) != len(best_pair[1]): self.solutions.append(([ max(self.env.reward_history) ], self.env.best_sequence()[0:last_reward_index])) self.record_history() print('replayed best with reward %f' % (new_rew * 100)) continue elif best_pair: mutation_rate = ( 1 - (best_reward / MAX_SCORE)) * MUTATION_DAMPEN mutated = self.mutate(best_pair[1], mutation_rate) new_rew, last_reward_index = self.exploit( self.env, mutated) print('mutated solution rewarded %f vs %f' % ((new_rew * 100), (best_reward * 100))) if (new_rew / best_reward > keep_percentage): self.solutions.append(([ max(self.env.reward_history) ], self.env.best_sequence()[0:last_reward_index])) self.record_history() continue else: self.env.reset() new_ep = False rew, new_ep = self.move(self.env, 100) if not new_ep and rew <= 0: print('backtracking due to negative reward: %f' % (rew * 100)) _, new_ep = self.move(self.env, 70, left=True) if new_ep: print('Episode rewarded %f vs %f' % ((rew * 100), (best_reward * 100))) self.record_history() self.solutions.append(([max(self.env.reward_history)], self.env.best_sequence())) def record_history(self): self.recorded_episode_count += 1 op = self.output_layer embeddings = [] for i in range(len(self.env.reward_history)): obs = self.env.obs_history[ i] #(i * batch_size):((i + 1) * batch_size)] embedding = self.session.run([op], {self.model.X: [obs]})[0].reshape( 512, ) reward = 
self.env.reward_history[i] action = self.env.action_history[i] #print('obs', np.array(obs).shape) #self.replay_buffer.add_sample({ # 'obs': obs, # 'actions': [action], # 'rewards': [reward], # 'new_obs': i == len(self.env.reward_history) # }) if reward > 0 and action[6]: self.history.append((embedding, reward, action)) save_interval = 2 if self.recorded_episode_count % save_interval == 0: print('recording history') #pickle.dump(self.history, open('./history.pkl', 'wb')) self.annoy_index = AnnoyIndex(512) for i in range(len(self.history)): self.annoy_index.add_item(i, self.history[i][0]) self.annoy_index.build(20) #pickle.dump(self.replay_buffer, open( "./replay_buffer.p", "wb" )) def mutate(self, sequence, mutation_rate): mutated = copy.deepcopy(sequence) sequence_length = len(sequence) mutation_count = 0 #mutation_start_index = min(sequence_length, random.randint(100, 2000)) if random.random() < sequence_length / 100.: deletion_index = random.randint(0, sequence_length - 1) deletion_length = random.randint(0, sequence_length // 5) del mutated[deletion_index:(deletion_index + deletion_length)] print('excised %d of %d actions' % (deletion_length, sequence_length)) trim_length = random.randint(0, sequence_length // 5) del mutated[-trim_length:] print('trimmed %d of %d actions' % (trim_length, sequence_length)) mutation_start_index = len(mutated) for i, action in reversed( list(enumerate(sequence[0:mutation_start_index]))): #percent_distance = i + 1 / sequence_length exponent = -(mutation_start_index - i - 1) / 1e2 if random.random() < np.exp(exponent) * mutation_rate: #mutated = mutated[0:i] #print('trimmed %d of %d actions' % (mutation_start_index - len(mutated), sequence_length)) #return mutated mutated[i] = random.choice(ACTIONS).copy() mutation_count += 1 print('mutated %d out of %d actions' % (mutation_count, sequence_length)) return mutated def random_next_step(self, left=False, jump_prob=1.0 / 10.0, jump_repeat=4, jumping_steps_left=0): action = random.choice(ACTIONS).copy() #np.zeros((12,), dtype=np.bool) action[6] = left action[7] = not left if jumping_steps_left > 0: action[0] = True jumping_steps_left -= 1 else: if random.random() < jump_prob: jumping_steps_left = jump_repeat - 1 action[0] = True return action, jumping_steps_left def move(self, env, num_steps, left=False, jump_prob=1.0 / 10.0, jump_repeat=4): """ Move right or left for a certain number of steps, jumping periodically. 
""" #start_time = time.clock() total_rew = 0.0 done = False steps_taken = 0 jumping_steps_left = 0 #random_prob = 0.5 use_memory = random.random() > 0.5 use_model = random.random() > 0.5 times = {} while not done and steps_taken < num_steps: if self.model \ and self.annoy_index is not None \ and len(self.env.obs_history) > 0 \ and not left \ and use_memory \ and self.recorded_episode_count > 5: #print('sample', time.clock() - start_time) ob = [self.env.obs_history[-1]] embedding = self.session.run([self.output_layer], {self.model.X: ob})[0].reshape( 512, ) results = self.annoy_index.get_nns_by_vector( embedding, 100, include_distances=True) items = [self.history[i] for i in results[0]] rewards = [item[1] for item in items] #action = self.history[results[0][np.argmax(np.multiply(rewards, np.divide(1, results[1] + 1e9)))]][2] if len(rewards) > 0: action = self.history[results[0][np.argmax(rewards)]][2] else: action, jumping_steps_left = self.random_next_step( left, jump_prob, jump_repeat, jumping_steps_left) #print(action, 'memory') _, rew, done, _ = env.step(action) #print('step', time.clock() - start_time) elif self.model \ and len(self.env.obs_history) > 0 \ and not left \ and use_model: ob = [self.env.obs_history[-1]] actions = self.session.run([self.a0], {self.model.X: ob}) action = ACTIONS[actions[0][0]].copy() #print(action, 'model') _, rew, done, _ = env.step(action) else: action, jumping_steps_left = self.random_next_step( left, jump_prob, jump_repeat, jumping_steps_left) #print(action, 'random') _, rew, done, _ = env.step(action) total_rew += rew steps_taken += 1 if done: break #print('time to move {} steps'.format(steps_taken), time.clock() - start_time) return total_rew, done def exploit(self, env, sequence): """ Replay an action sequence; pad with NOPs if needed. Returns the final cumulative reward. """ env.reset() done = False idx = 0 total_reward = 0 jumping_steps_left = 0 left = False last_reward_index = 0 while not done: if idx >= len(sequence) or idx - last_reward_index > 100: while not done: steps = 100 reward, done = self.move(env, steps, left) idx += steps if left: left = False if reward == 0: left = True else: last_reward_index = idx else: action = sequence[idx] _, reward, done, info = env.step(action) total_reward += reward if reward > 0: last_reward_index = idx #_, _, done, _ = env.step(action) idx += 1 return env.total_reward, last_reward_index
class W2V_ANN(Model): def __init__(self, config): self.requirement = [ 'test_file', 'lastN', 'topN', 'type', 'item_vec_file', 'index_file_file' ] self.config = config miss = set() for item in self.requirement: if item not in self.config: miss.add(item) if len(miss) > 0: raise Exception(f"Miss the key : {miss}") Model.__init__(self, self.config['test_file'], self.config['lastN'], self.config['topN']) self.type = config['type'] # behavior / item def train(self): b_time = time.time() self.item_idx = {} self.item_idx_reverse = {} with open(self.config['item_vec_file'], 'r') as in_f: num_items, dim = in_f.readline().strip().split() print(f'Num of items : {num_items}, dim : {dim}') self.t = AnnoyIndex(int(dim), 'angular') for idx, line in tqdm(enumerate(in_f)): tmp = line.split() self.item_idx[tmp[0]] = idx self.item_idx_reverse[idx] = tmp[0] self.t.add_item(idx, list(map(float, tmp[1:]))) print("Read file finished ...") file_name = self.config['index_file_file'] + '.' + self.type self.t.build(30) # 10 trees self.t.save(f'{file_name}.ann') # self.t.load(f'{file_name}.ann') print(f"Train finished ...{time.time() - b_time}") def predict(self, last_n_events, topN): b_time = time.time() candidate_set = set() if self.type == 'item': last_n_items = [ self.item_idx[e.split(':', 1)[1]] for e in last_n_events[::-1] if e in self.item_idx ] else: last_n_items = [ self.item_idx[e] for e in last_n_events[::-1] if e in self.item_idx ] if len(last_n_items) == 0: return [] rank_weight = np.array( [1 / np.log2(rank + 2) for rank in range(len(last_n_items))]) # Calculate session vector session_vec = np.mean([ np.array(self.t.get_item_vector(e)) * rank_weight[idx] for idx, e in enumerate(last_n_items) ], axis=0) r_items, r_scores = self.t.get_nns_by_vector(session_vec, topN * 2, include_distances=True) res = [] for item in r_items: if item in last_n_items: continue try: if self.type == 'item': item_raw = self.item_idx_reverse[item] else: item_raw = self.item_idx_reverse[item].split(':', 1)[1] if item_raw in res: continue res.append(item_raw) except: pass if len(res) == topN: break return res
class Annoy: def __init__(self): self.dim = 300 self.sim_metric = 'angular' self.n_trees = 10 self.search_k = 1 self.modelLoaded = False # self.loadModelFromDisk(model_location) def initAnnoy(self, dim, metric, matrix): self.sim_metric = metric self.dim = dim print('Annoy init index') self.a_index = AnnoyIndex(self.dim, self.sim_metric) build_ = self.a_index.build(self.n_trees) #if build_: # self.modelLoaded = self.saveModelToDisk(model_location, self.a_index) return build_ #self.modelLoaded def addVectors(self, documents): ids = [] # unbuild annoy index before adding new data self.a_index.unbuild() # add vectors for document in documents: _id = document._id vec = document.vector ids.append(_id) vector_e = vec.e vector_e_l = len(vector_e) # check if the vector length is below dimention limit # then pad vector with 0 by dimension if vector_e_l < self.dim: vector_e.extend([0]*(self.dim-vector_e_l)) # make sure vector length doesn't exceed dimension limit vector_e = vector_e[:self.dim] # add vector self.a_index.add_item(int(_id), vector_e) # build vector build_ = self.a_index.build(self.n_trees) # if build_: # self.modelLoaded = self.saveModelToDisk(model_location, self.a_index) return build_, ids def deleteVectors(self, ids): return True, ids def getNearest(self, matrix, k): ids = [] dists = [] for vec_data in matrix: _id, _dist = self.a_index.get_nns_by_vector(vec_data, k, search_k=self.search_k, include_distances=True) ids.append(_id) dists.append(_dist) return True, ids, dists def loadModelFromDisk(self, location): try: # read index self.a_index = AnnoyIndex(self.dim, self.sim_metric) self.a_index.load(location) print('Annoy index loading success') return True except: print('Annoy index loading failed') return False def saveModelToDisk(self, location, index): try: # write index index.save(location) print('Annoy index writing success') return True except: print('Annoy index writing failed') return False
class NearSentence(object): def __init__(self, fn_word, model_name, model_path): self.model = QueryModel(fn_word, model_name, model_path) self.queries = [] self.titles = [] self.query_index = 0 self.title_index = 0 self.query_ann = AnnoyIndex(self.model.dim, metric='euclidean') self.title_ann = AnnoyIndex(self.model.dim, metric='euclidean') def load_queries(self, fn_query, column): print '[In load_queries] Load candidate queries' sentences = [] chunk = [] vecs = [] with open(fn_query) as fin: for line in fin: ll = line.decode('utf8').strip().split('\t') if len(ll) < column: continue chunk.append(ll[column - 1]) if len(chunk) == 1000: vec, valid_sentence = self.model.get_query_vec(chunk) vec = vec / np.sqrt(np.sum(vec**2, 1, keepdims=True)) vecs.extend(list(vec)) sentences.extend(valid_sentence) chunk = [] if len(chunk) > 0: vec, valid_sentence = self.model.get_query_vec(chunk) vecs.extend(list(vec)) sentences.extend(valid_sentence) print '[In load_queries] Build query annoy tree' for s, v in izip(sentences, vecs): self.queries.append(s) # if vecs == [0] * self.vectorizer.dim: # continue self.query_ann.add_item(self.query_index, v) self.query_index += 1 self.query_ann.build(10) print '[In load_queries] Size of tree =', self.query_ann.get_n_items() def load_titles(self, fn_title, column): print '[In load_titles] Load candidate titles' sentences = [] chunk = [] vecs = [] with open(fn_title) as fin: for line in fin: ll = line.decode('utf8').strip().split('\t') if len(ll) < column: continue chunk.append(ll[column - 1]) if len(chunk) == 1000: vec, valid_sentence = self.model.get_title_vec(chunk) vec = vec / np.sqrt(np.sum(vec ** 2, 1, keepdims=True)) vecs.extend(list(vec)) sentences.extend(valid_sentence) chunk = [] if len(chunk) > 0: vec, valid_sentence = self.model.get_title_vec(chunk) vec = vec / np.sqrt(np.sum(vec ** 2, 1, keepdims=True)) vecs.extend(list(vec)) sentences.extend(valid_sentence) print '[In load_titles] Build titles annoy tree, size =', len(vecs) for s, v in izip(sentences, vecs): self.titles.append(s) self.title_ann.add_item(self.title_index, v) # v is a list self.title_index += 1 self.title_ann.build(10) print '[In load_titles] Size of tree =', self.title_ann.get_n_items() def get_k_nearest_query(self, query, k): if isinstance(query, unicode): query = query.encode('utf8') cut_data = text_cutter.process({'title': query}) cut_query = cut_data['cut_title'].decode('utf8') vecs, valid_queries= self.model.get_query_vec([cut_query]) if len(valid_queries) == 0: return [] vecs = vecs / np.sqrt(np.sum(vecs ** 2, 1, keepdims=True)) vec = list(vecs)[0] k_neighbors, scores = self.query_ann.get_nns_by_vector(vec, n=k, include_distances=True) neighbors = [] for i in k_neighbors: neighbors.append(self.queries[i]) return sorted(zip(neighbors, scores), key=lambda x: x[-1]) # def sim(self, u, v): # norm_u = u / np.sqrt(np.sum(u ** 2, keepdims=True)) # norm_v = u /np.sqrt(np.sum(v ** 2, keepdims=True)) # return np.dot(norm_u, norm_v) def get_k_nearest_title(self, title, k): if isinstance(title, unicode): title = title.encode('utf8') cut_data = text_cutter.process({'title': title}) title = cut_data['cut_title'].decode('utf8') vecs, valid_titles = self.model.get_title_vec([title]) if len(valid_titles) == 0: return [] vecs = vecs / np.sqrt(np.sum(vecs ** 2, 1, keepdims=True)) vec = list(vecs)[0] k_neighbors, scores = self.title_ann.get_nns_by_vector(vec, n=k, include_distances=True) neighbors = [] for i in k_neighbors: neighbors.append(self.titles[i]) return sorted(zip(neighbors, scores), 
key=lambda x: x[-1]) def get_answers(self, query, k): if isinstance(query, unicode): query = query.encode('utf8') cut_data = text_cutter.process({'title': query}) cut_query = cut_data['cut_title'].decode('utf8') vecs, valid_queries = self.model.get_query_vec([cut_query]) if len(valid_queries)==0: return [] vecs = vecs / np.sqrt(np.sum(vecs ** 2, 1, keepdims=True)) vec = list(vecs)[0] # recall titles according to cosine similarity candidate_titles_index, scores = self.title_ann.get_nns_by_vector(vec, n=k*10, include_distances=True) # rank candidate titles using model candidate_titles = [] for i in candidate_titles_index: candidate_titles.append(self.titles[i]) ranks = self.model.rank_titles(cut_query, candidate_titles)[:k] return ranks def process(self, data): res = {} if 'titles' in data: res['title_nns'] = self.get_k_nearest_title(data['titles'], 10) if 'queries' in data: res['query_nns'] = self.get_k_nearest_query(data['queries'], 10) return json.dumps(res, ensure_ascii=False).encode('utf8')
def evaluate_predictions(self,id_to_latent_factor_dict,userid_to_latent_factor_dict,spectrogram_dir,num_users,id_to_songname_dict): #evaluate latent factor predictions get_song_embeddings = Model(inputs=self.model.input,outputs=self.model.get_layer(index=13).output) with open('Metadata\\song_id_to_prediction.txt') as f: song_id_to_prediction = json.loads(f.read()) with open('Metadata\\user_id_to_prediction.txt') as f: user_id_to_prediction = json.loads(f.read()) song_ids,song_predictions = zip(*(song_id_to_prediction.items())) #get song vectors generated by the cnn and the corresponding song ids user_ids,user_predictions = zip(*(user_id_to_prediction.items())) #get user vectors generated by the cnn and the corresponding user ids print('Predictions for {} songs in database created.....'.format(len(song_ids))) print('Predictions for {} users in database created.....'.format(len(user_ids))) new_subset_songs = [] accuracy = [] # list containing recommendation accuracy for each user mAP = [] # list for containing mAP for each user song_ids_actual = [] #list of songs song_latent_factors = [] #get latent factors for every song in the dataset for song in song_ids: try: song_latent_factors.append(id_to_latent_factor_dict[song]) except KeyError: pass #build vector space with predicted latent factors t_pred_space = AnnoyIndex(self.num_factors,'angular') for i in range(len(song_predictions)): t_pred_space.add_item(i,song_predictions[i]) t_pred_space.build(10) #build vector space with actual latent factors t_latent_space = AnnoyIndex(self.num_factors,'angular') for i in range(len(song_latent_factors)): t_latent_space.add_item(i,song_latent_factors[i]) t_latent_space.build(10) user_count = 0 for i in range(len(user_ids)): closest_songs_predicted = [] closest_songs_actual = [] user_id = user_ids[i] user_count+=1 closest_songs = t_pred_space.get_nns_by_vector(user_predictions[i],500,include_distances = False) #get 500 closest songs to each user vector for index in closest_songs: try: closest_songs_predicted.append(id_to_songname_dict[song_ids[index]]) except KeyError: pass #get the songids of the closest songs to the given user vector print("Closest songs generated by our network for user number {} ({}) is:".format(user_count,user_id)) print(closest_songs_predicted[0:50]) print('\n============================================\n') closest_songs_latent_space = t_latent_space.get_nns_by_vector(userid_to_latent_factor_dict[user_id],500,include_distances=False) for index in closest_songs_latent_space: try: closest_songs_actual.append(id_to_songname_dict[song_ids[index]]) except KeyError: pass print("Closest songs generated by latent factors for user number {} is:".format(user_count)) print(closest_songs_actual[0:50]) print('\n') map_user = ml_metrics.mapk(closest_songs_actual,closest_songs_predicted,k=500) good_recom = set(closest_songs_actual) & set(closest_songs_predicted) good_recom_count = len(good_recom) accuracy_user = (good_recom_count/500)*100 #print("The accuracy for this user is {} ".format(accuracy_user)) print("The map for this user is {} ".format(map_user)) print('\n') accuracy.append(accuracy_user) mAP.append(map_user) total_accuracy = sum(accuracy)/user_count total_map = sum(mAP)/user_count print('\n The mAP of the recommendations for {} users is {} '.format(user_count,total_map)) print('The accuracy of the recommendations for {} users is {}%'.format(user_count,total_accuracy)) return total_accuracy
def random_nn_trees(X, num_trees):
    t = AnnoyIndex(X.shape[1], 'euclidean')
    for i in range(X.shape[0]):
        t.add_item(i, X[i, :])
    t.build(num_trees)
    return t
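# A short usage sketch for random_nn_trees, assuming X is a NumPy matrix of
# row vectors; the shape (1000, 32) and the query below are illustrative only.
import numpy as np

X = np.random.randn(1000, 32)
index = random_nn_trees(X, num_trees=20)
# 10 nearest rows (by Euclidean distance) to the first row, including itself
print(index.get_nns_by_item(0, 10))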
class Embed(object): def __init__(self, data_path, model='w2v', num_walks=100, walk_length=10): ''' data: a dataframe: user, item model: 'w2v','deepwalk','gcn', 'gat' ''' self.le = preprocessing.LabelEncoder() self.data, self.le = convert_to(data_path, self.le) self.w2v_model = None self._annoy = None self._embeddings = {} self.model_type = model self.num_walks = num_walks self.walk_length = walk_length if self.model_type == 'w2v': self.sentences = generate_sentences(self.data) if self.model_type == 'deepwalk': self.sentences = generate_sentences_dw(data) if self.model_type == 'gat': pass def train(self, window_size=5, workers=3, iter=5, learning_rate=0.01, epochs=10, dimensions=128, num_of_walks=80, beta=0.5, gamma=0.5, **kwargs): self.workers = workers self.iter = iter self.window_size = window_size self.learning_rate = learning_rate self.epochs = epochs self.dimensions = dimensions self.num_of_walks = num_of_walks self.beta = beta self.gamma = gamma self._annoy = AnnoyIndex(dimensions, 'angular') if self.model_type == 'w2v' or self.model_type == 'deepwalk': kwargs["sentences"] = self.sentences kwargs["min_count"] = kwargs.get("min_count", 0) kwargs["size"] = self.dimensions kwargs["sg"] = 1 # skip gram kwargs["hs"] = 1 # deepwalk use Hierarchical Softmax kwargs["workers"] = self.workers kwargs["window"] = self.window_size kwargs["iter"] = self.iter print(f"There are {self.data.user.nunique()} users") print(f"There are {self.data.item.nunique()} items") print("Learning embedding vectors...") model_w2v = Word2Vec(**kwargs) print("Learning embedding vectors done!") self.w2v_model = model_w2v words = self.data['user'].unique().tolist( ) + self.data['item'].unique().tolist() for word in words: self._annoy.add_item( self.le.transform([word])[0], self.w2v_model.wv[word]) self._annoy.build(-1) if self.model_type == 'gat': model = AttentionWalkTrainer(graph_path=self.data, dimensions=self.dimensions, learning_rate=self.learning_rate, epochs=self.epochs, window_size=self.window_size, num_of_walks=self.num_of_walks, beta=self.beta, gamma=self.gamma) model.fit() emb = model.save_embedding() for id in emb.id: self._annoy.add_item(int(id), emb[emb.id == id].values.tolist()[0][1:]) self._annoy.build(-1) #return model_w2v # def get_embeddings(self,): # if self.w2v_model is None: # print("model not train") # return {} # # self._embeddings = {} # words = self.data['user'].unique().tolist() + self.data['item'].unique().tolist() # for word in words: # self._embeddings[word] = self.w2v_model.wv[word] # # return self._embeddings def search(self, seed, k=5, type=None): ''' seed: seed item to find nearest neighbor k: number of cloest neighhbors ''' a_return = self._annoy.get_nns_by_item( int(self.le.transform([seed])[0]), k) return list(self.le.inverse_transform(a_return))
def convert(input_file_path, output_file_path=None, precision=DEFAULT_PRECISION, subword=False, subword_start=DEFAULT_NGRAM_BEG, subword_end=DEFAULT_NGRAM_END, approx=False, approx_trees=None, vocab_path=None, unicode_errors='strict'): files_to_remove = [] subword = int(subword) approx = int(approx) # If no output_file_path specified, create it in a tempdir if output_file_path is None: output_file_path = os.path.join( tempfile.mkdtemp(), fast_md5_file(input_file_path) + '.magnitude') if os.path.isfile(output_file_path): try: conn = sqlite3.connect(output_file_path) db = conn.cursor() db.execute( "SELECT value FROM magnitude_format WHERE key='size'") \ .fetchall()[0][0] conn.close() # File already exists and is functioning return output_file_path except BaseException: pass # Check args meta_1_path = None meta_2_path = None input_is_text = input_file_path.endswith('.txt') or \ input_file_path.endswith('.vec') input_is_binary = input_file_path.endswith('.bin') input_is_hdf5 = input_file_path.endswith('.hdf5') input_is_hdf5_weights = input_file_path.endswith('_weights.hdf5') if not input_is_text and not input_is_binary and not input_is_hdf5: exit("The input file path must be `.txt`, `.bin`, `.vec`, or `.hdf5`") if not output_file_path.endswith('.magnitude'): exit("The output file path file path must be `.magnitude`") if vocab_path and not vocab_path.endswith(".magnitude"): exit("The vocab file path file path must be `.magnitude`") # Detect ELMo and ELMo options file input_is_elmo = False elmo_options_path = None if input_is_hdf5: elmo_options_path = input_file_path[0:-13] + \ '_options.json' if input_is_hdf5_weights else input_file_path[0:-5] + '.json' # noqa if not os.path.isfile(elmo_options_path): exit( "Expected `" + elmo_options_path + "` to exist. ELMo models require a JSON options file.") input_is_elmo = True meta_1_path = input_file_path meta_2_path = elmo_options_path # Detect GloVe format and convert to word2vec if detected detected_glove = False if input_is_text: with io.open(input_file_path, mode="r", encoding="utf-8", errors="ignore") as ifp: line1 = None line2 = None while line1 is None or line2 is None: line = ifp.readline().strip() if len(line) > 0: if line1 is None: line1 = line elif line2 is None: line2 = line line1 = line1.replace('\t', ' ') line2 = line2.replace('\t', ' ') line1 = line1.split() line2 = line2.split() if len(line1) == len(line2): # No header line present detected_glove = True if detected_glove: eprint("Detected GloVe format! Converting to word2vec format first..." "(this may take some time)") temp_file_path = os.path.join( tempfile.mkdtemp(), os.path.basename(input_file_path) + '.txt') try: import gensim except ImportError: raise ImportError("You need gensim >= 3.3.0 installed with pip \ (`pip install gensim`) to convert GloVe files.") gensim.scripts.glove2word2vec.glove2word2vec( input_file_path, temp_file_path ) input_file_path = temp_file_path files_to_remove.append(temp_file_path) # Open and load vector file eprint("Loading vectors... 
(this may take some time)") number_of_keys = None dimensions = None if input_is_binary: try: from gensim.models import KeyedVectors except ImportError: raise ImportError("You need gensim >= 3.3.0 installed with pip \ (`pip install gensim`) to convert binary files.") keyed_vectors = KeyedVectors.load_word2vec_format( input_file_path, binary=input_is_binary, unicode_errors=unicode_errors) number_of_keys = len(keyed_vectors.vectors) dimensions = len(keyed_vectors.vectors[0]) elif input_is_text: # Read it manually instead of with gensim so we can stream large models class KeyedVectors: pass def keyed_vectors_generator(): number_of_keys, dimensions = (None, None) f = io.open(input_file_path, mode="r", encoding="utf-8", errors="ignore") first_line = True for line in f: line_split = line.strip().replace('\t', ' ').split() if len(line_split) == 0: continue if first_line: first_line = False number_of_keys = int(line_split[0]) dimensions = int(line_split[1]) yield (number_of_keys, dimensions) else: empty_key = len(line_split) == dimensions vec_floats = line_split if empty_key else line_split[1:] key = "" if empty_key else line_split[0] if len(vec_floats) > dimensions: key = " ".join( [key] + vec_floats[0:len(vec_floats) - dimensions]) vec_floats = vec_floats[len(vec_floats) - dimensions:] vector = np.asarray([float(elem) for elem in vec_floats]) yield (key, vector) keyed_vectors = KeyedVectors() kv_gen = keyed_vectors_generator() number_of_keys, dimensions = next(kv_gen) kv_gen_1, kv_gen_2 = tee(kv_gen) keyed_vectors.vectors = imap(lambda kv: kv[1], kv_gen_1) keyed_vectors.index2word = imap(lambda kv: kv[0], kv_gen_2) elif input_is_elmo: vocab_magnitude = None if vocab_path: vocab_magnitude = Magnitude(vocab_path, eager=False, lazy_loading=1) else: vocab_magnitude = FeaturizerMagnitude(100) class KeyedVectors: pass elmo = ElmoEmbedder(elmo_options_path, input_file_path) keyed_vectors = KeyedVectors() number_of_keys = len(vocab_magnitude) dimensions = np.concatenate(elmo.embed_batch( [["test"]])[0], axis=1).flatten().shape[0] kv_gen_1, kv_gen_2 = tee(vocab_magnitude) keyed_vectors.vectors = chain.from_iterable( imap( lambda b: imap( lambda e: np.concatenate( e, axis=1).flatten(), elmo.embed_batch( list( imap( lambda k: [k], b)))), ibatch( imap( lambda kv: kv[0], kv_gen_1), 1000))) keyed_vectors.index2word = imap(lambda kv: kv[0], kv_gen_2) else: class KeyedVectors: pass keyed_vectors = KeyedVectors() number_of_keys = 0 dimensions = 0 keyed_vectors.vectors = [] keyed_vectors.index2word = [] eprint("Found %d key(s)" % number_of_keys) eprint("Each vector has %d dimension(s)" % dimensions) # Delete files if they exist try_deleting(output_file_path) try_deleting(output_file_path + "-shm") try_deleting(output_file_path + "-wal") # Temporarily re-direct the output to a tmp file output_file_path_tmp = output_file_path + '.tmp' output_file_path_orig = output_file_path output_file_path = output_file_path_tmp # Delete files if they exist try_deleting(output_file_path) try_deleting(output_file_path + "-shm") try_deleting(output_file_path + "-wal") # Connect to magnitude datastore conn = sqlite3.connect(output_file_path) db = conn.cursor() # Make the database fast conn.isolation_level = None db.execute("PRAGMA synchronous = OFF;") db.execute("PRAGMA default_synchronous = OFF;") db.execute("PRAGMA journal_mode = WAL;") db.execute("PRAGMA count_changes = OFF;") # Create table structure eprint("Creating magnitude format...") db.execute("DROP TABLE IF EXISTS `magnitude`;") db.execute(""" CREATE TABLE `magnitude` ( 
key TEXT COLLATE NOCASE, """ + ",\n".join([("dim_%d INTEGER" % i) for i in range(dimensions)]) + ",\nmagnitude REAL" + """ ); """) db.execute(""" CREATE TABLE `magnitude_format` ( key TEXT COLLATE NOCASE, value INTEGER ); """) if subword: db.execute(""" CREATE VIRTUAL TABLE `magnitude_subword` USING fts3( char_ngrams, num_ngrams ); """) if approx: db.execute(""" CREATE TABLE `magnitude_approx` ( trees INTEGER, index_file BLOB ); """) metas = [('meta_1', meta_1_path), ('meta_2', meta_2_path)] for meta_name, meta_path in metas: if meta_path: db.execute(""" CREATE TABLE `magnitude_""" + meta_name + """` ( meta_file BLOB ); """) # Create annoy index approx_index = None if approx: approx_index = AnnoyIndex(dimensions) # Write vectors eprint("Writing vectors... (this may take some time)") insert_query = """ INSERT INTO `magnitude`( key, """ + \ ",\n".join([("dim_%d" % i) for i in range(dimensions)]) + \ ",\nmagnitude" \ + """) VALUES ( """ + \ (",\n".join(["?"] * (dimensions + 2))) \ + """ ); """ insert_subword_query = """ INSERT INTO `magnitude_subword`( char_ngrams, num_ngrams ) VALUES ( ?, ? ); """ counters = [Counter() for i in range(dimensions)] key_vectors_iterable = izip(keyed_vectors.index2word, keyed_vectors.vectors) progress = -1 db.execute("BEGIN;") for i, (key, vector) in enumerate(key_vectors_iterable): current_progress = int((float(i) / float(number_of_keys)) * 100) if current_progress > progress: progress = current_progress eprint("%d%% completed" % progress) if i % 100000 == 0: db.execute("COMMIT;") db.execute("BEGIN;") magnitude = np.linalg.norm(vector) vector = vector / magnitude epsilon = np.random.choice( [-1.0 / (10**precision), 1.0 / (10**precision)], dimensions) vector = epsilon if np.isnan(vector).any() else vector for d, v in enumerate(vector): counters[d][int(v * 100)] += 1 db.execute(insert_query, (key,) + tuple(int(round(v * (10**precision))) for v in vector) + (float(magnitude),)) # noqa if subword: ngrams = set( (n.lower() for n in char_ngrams( BOW + key + EOW, subword_start, subword_end))) num_ngrams = len(ngrams) * 4 ngrams = set((n for n in ngrams if not any( [c in SQLITE_TOKEN_SPLITTERS for c in n]))) db.execute(insert_subword_query, (" ".join(ngrams), num_ngrams)) if approx: approx_index.add_item(i, vector) eprint("Committing written vectors... (this may take some time)") db.execute("COMMIT;") # Figure out which dimensions have the most entropy entropies = [(d, entropy(counter)) for d, counter in enumerate(counters)] entropies.sort(key=lambda e: e[1], reverse=True) for e in entropies: eprint("Entropy of dimension %d is %f" % (e[0], e[1])) highest_entropy_dimensions = [e[0] for e in entropies] # Writing metadata insert_format_query = """ INSERT INTO `magnitude_format`( key, value ) VALUES ( ?, ? 
); """ db.execute(insert_format_query, ('version', CONVERTER_VERSION)) db.execute(insert_format_query, ('elmo', input_is_elmo)) db.execute(insert_format_query, ('size', number_of_keys)) db.execute(insert_format_query, ('dim', dimensions)) db.execute(insert_format_query, ('precision', precision)) if subword: db.execute(insert_format_query, ('subword', subword)) db.execute(insert_format_query, ('subword_start', subword_start)) db.execute(insert_format_query, ('subword_end', subword_end)) if approx: if approx_trees is None: approx_trees = max(50, int((number_of_keys / 3000000.0) * 50.0)) db.execute(insert_format_query, ('approx', approx)) db.execute(insert_format_query, ('approx_trees', approx_trees)) for d in highest_entropy_dimensions: db.execute(insert_format_query, ('entropy', d)) # Create indicies eprint("Creating search index... (this may take some time)") db.execute("CREATE INDEX `magnitude_key_idx` ON `magnitude` (key);") for i in highest_entropy_dimensions[0:1]: eprint("Creating spatial search index for dimension %d " "(it has high entropy)... (this may take some time)" % i) db.execute(""" CREATE INDEX `magnitude_dim_%d_idx` ON `magnitude` (dim_%d); """ % (i, i)) # Write approximate index to the database if approx: eprint("Creating approximate nearest neighbors index... \ (this may take some time)") approx_index.build(approx_trees) approx_index_file_path = os.path.join( tempfile.mkdtemp(), fast_md5_file(input_file_path) + '.ann') eprint("Dumping approximate nearest neighbors index... \ (this may take some time)") approx_index.save(approx_index_file_path) eprint("Compressing approximate nearest neighbors index... \ (this may take some time)") chunk_size = 104857600 full_size = os.path.getsize(approx_index_file_path) insert_approx_query = """ INSERT INTO magnitude_approx(trees, index_file) VALUES (?, ?); """ with open(approx_index_file_path, 'rb') as ifh, \ lz4.frame.LZ4FrameCompressor() as compressor: for i, chunk in enumerate(iter(partial(ifh.read, chunk_size), b'')): if i == 0: chunk = compressor.begin() + compressor.compress(chunk) else: chunk = compressor.compress(chunk) eprint(str((ifh.tell() / float(full_size)) * 100.0) + "%") if len(chunk) > 0: db.execute(insert_approx_query, (approx_trees, sqlite3.Binary(chunk))) chunk = compressor.flush() if len(chunk) > 0: db.execute(insert_approx_query, (approx_trees, sqlite3.Binary(chunk))) files_to_remove.append(approx_index_file_path) for meta_name, meta_path in metas: if not meta_path: continue eprint("Compressing meta file... \ (this may take some time)") chunk_size = 104857600 full_size = os.path.getsize(meta_path) insert_meta_query = """ INSERT INTO magnitude_""" + meta_name + """(meta_file) VALUES (?); """ with open(meta_path, 'rb') as ifh, \ lz4.frame.LZ4FrameCompressor() as compressor: for i, chunk in enumerate(iter(partial(ifh.read, chunk_size), b'')): if i == 0: chunk = compressor.begin() + compressor.compress(chunk) else: chunk = compressor.compress(chunk) eprint(str((ifh.tell() / float(full_size)) * 100.0) + "%") if len(chunk) > 0: db.execute(insert_meta_query, (sqlite3.Binary(chunk),)) chunk = compressor.flush() if len(chunk) > 0: db.execute(insert_meta_query, (sqlite3.Binary(chunk),)) # Clean up if len(files_to_remove) > 0: eprint("Cleaning up temporary files...") for file_to_remove in files_to_remove: try_deleting(file_to_remove) # Calculate max duplicate keys eprint("Finding duplicate keys... 
(this may take some time)") duplicate_keys_query = db.execute(""" SELECT MAX(key_count) FROM ( SELECT COUNT(key) AS key_count FROM magnitude GROUP BY key ); """).fetchall() max_duplicate_keys = ( duplicate_keys_query[0][0] if duplicate_keys_query[0][0] is not None else 1) # noqa eprint( "Found %d as the maximum number of duplicate key(s)" % max_duplicate_keys) db.execute(insert_format_query, ('max_duplicate_keys', max_duplicate_keys)) # VACUUM eprint("Vacuuming to save space... (this may take some time)") db.execute("VACUUM;") # Restore safe database settings db.execute("PRAGMA synchronous = FULL;") db.execute("PRAGMA default_synchronous = FULL;") db.execute("PRAGMA journal_mode = DELETE;") db.execute("PRAGMA count_changes = ON;") # Clean up connection conn.commit() conn.close() files_to_remove.append(output_file_path + "-shm") files_to_remove.append(output_file_path + "-wal") # Clean up if len(files_to_remove) > 0: eprint("Cleaning up temporary files...") for file_to_remove in files_to_remove: try_deleting(file_to_remove) # Rename file the temporary output to the real output os.rename(output_file_path, output_file_path_orig) output_file_path = output_file_path_orig # Print success eprint("Successfully converted '%s' to '%s'!" % (input_file_path, output_file_path)) return output_file_path
print('Centering the dataset and queries')
center = np.mean(trainDataset, axis=0)
trainDataset -= center
queries -= center
print('Done')

print('Constructing the index')
t1 = timeit.default_timer()
# set the parameters
dimension = len(trainDataset[0])
index = AnnoyIndex(dimension, 'angular')
for (i, vec) in enumerate(trainDataset):
    index.add_item(i, vec)
index.build(number_of_trees)
t2 = timeit.default_timer()
print('Done')
print('Construction time: {}'.format(t2 - t1))

print("start querying")
for k in search_k:
    score = 0.0
    t1 = timeit.default_timer()
    for (i, query) in enumerate(queries):
        score += len(set(index.get_nns_by_vector(query, topk, search_k=k))
                     .intersection(set(groundTruth[i])))
    t2 = timeit.default_timer()
    print("for search_k = {}".format(k))
    print('Query time: {} ms per query'.format((t2 - t1) * 1000 / float(len(queries))))
    print("the recall is {}".format(score / topk / float(len(queries))))
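# A hedged sketch of how a groundTruth array like the one used above could be
# produced by brute force with NumPy (the benchmark itself assumes groundTruth
# is given). For Annoy's angular metric, ranking by cosine similarity on the
# centered data gives the same ordering; topk mirrors the benchmark variable.
import numpy as np

def brute_force_ground_truth(trainDataset, queries, topk):
    train_norm = trainDataset / np.linalg.norm(trainDataset, axis=1, keepdims=True)
    query_norm = queries / np.linalg.norm(queries, axis=1, keepdims=True)
    sims = query_norm @ train_norm.T          # cosine similarity matrix
    # indices of the topk most similar training points for each query
    return np.argsort(-sims, axis=1)[:, :topk]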
# encoding=utf-8
from annoy import AnnoyIndex
import random

f = 40
t = AnnoyIndex(40)
for i in range(1000):
    v = [random.uniform(-1, 1) for z in range(f)]
    t.add_item(i, v)
t.build(10)
t.save('./test.ann')

u = AnnoyIndex(f)
u.load('./test.ann')
target = [0.0] * f
# the results are item ids
a, b = u.get_nns_by_vector(target, 5, include_distances=True)
print a, b
for idx in a:
    # fetch the actual vector for this item id
    print u.get_item_vector(idx)
print("[!] Creating a new image similarity search index.") print("[!] Loading the inception CNN") create_graph("./tensorflow_inception_graph.pb") print("[!] Done.") input_path = sys.argv[2] files = os.listdir(input_path) images = [input_path + i for i in files] results = extract_features(images, True) print("[!] Done extracting features, building search index") ann_index = AnnoyIndex(len(results[0])) for i in xrange(len(images)): ann_index.add_item(i, results[i]) print("[!] Constructing trees") ann_index.build(80) print("[!] Saving the index to '%s'" % sys.argv[3]) ann_index.save(sys.argv[3]) print("[!] Saving the filelist to '%s'" % (sys.argv[3] + ".filelist")) filelist = file(sys.argv[3] + ".filelist", "wt") filelist.write("\n".join(images)) filelist.close() elif sys.argv[1] == "search": print("[!] Searching for similar images.") print("[!] Loading the inception CNN") create_graph("./tensorflow_inception_graph.pb") print("[!] Done.") input_path = sys.argv[2] files = os.listdir(input_path) images = [input_path + i for i in files] results = extract_features(images, True)
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description:
"""
from annoy import AnnoyIndex

a = AnnoyIndex(3, 'angular')
a.add_item(0, [1, 0, 0])
a.add_item(1, [0, 1, 0])
a.add_item(2, [0, 0, 1])
a.build(-1)

print(a.get_nns_by_item(0, 100))
print(a.get_nns_by_vector([1.0, 0.5, 0.5], 100))

import random

f = 40
t = AnnoyIndex(f, 'angular')  # Length of item vector that will be indexed
for i in range(1000):
    v = [random.gauss(0, 1) for z in range(f)]
    t.add_item(i, v)
t.build(10)  # 10 trees
t.save('test.ann')

# ...

u = AnnoyIndex(f, 'angular')
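# A plausible continuation sketch (hedged; not part of the snippet above):
# a second index of the same dimensionality and metric can memory-map the
# saved 'test.ann' file and serve queries.
u.load('test.ann')  # fast, the file is mmap-ed into memory
print(u.get_nns_by_item(0, 10))            # 10 nearest neighbours of item 0
print(u.get_nns_by_vector([0.0] * f, 10))  # 10 nearest to an explicit vector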
def get_nearest_neighbors(
    node_ids: List[Union[int, str]], embeddings: np.ndarray, num_nearest: int
) -> Dict[Union[int, str], Set[Union[int, str]]]:
    """
    Compute similar nodes among a set of embeddings.

    Parameters
    ----------
    node_ids: Names of each embedding sample.
    embeddings: Embedding vectors of shape (num_total, embed_dim).
    num_nearest: Number of nearest-neighbors to find.

    Returns
    -------
    Dictionary mapping node_ids to nearest-neighbors
    """
    num_total = embeddings.shape[0]
    embed_dim = embeddings.shape[1]
    if num_total != len(node_ids):
        raise SimilaritySearchError("Lengths of node_ids and embeddings must match")

    # standardize embeddings (zero mean, unit variance per dimension)
    embeddings = StandardScaler().fit_transform(embeddings)

    # initialize annoy index w/ angular distance metric
    annoy_index = AnnoyIndex(embed_dim, metric="angular")

    # Annoy item ids must be integers, so items are added by position
    # and mapped back to node_ids afterwards
    id_to_pos = {node_id: i for i, node_id in enumerate(node_ids)}
    for i in range(num_total):
        annoy_index.add_item(i, embeddings[i])

    # build index
    annoy_index.build(n_trees=16, n_jobs=-1)

    nodes_neighbors = {}
    for node_id in node_ids:
        neighbor_positions = annoy_index.get_nns_by_item(id_to_pos[node_id], num_nearest + 1)
        # map positions back to ids and remove the node itself
        neighbors = [node_ids[p] for p in neighbor_positions if node_ids[p] != node_id]
        nodes_neighbors[node_id] = neighbors

    # symmetric filtering: keep only pairs that are neighbors of each other
    for node_id, neighbors in nodes_neighbors.items():
        filter_nodes = [x for x in neighbors if node_id in nodes_neighbors[x]]
        nodes_neighbors[node_id] = filter_nodes

    nodes_neighbors = {
        node_id: list(zip(neighbors, range(len(neighbors))))
        for node_id, neighbors in nodes_neighbors.items()
        if len(neighbors) > 0
    }
    return nodes_neighbors
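# A small usage sketch for get_nearest_neighbors, assuming the function and
# its module-level imports (numpy, StandardScaler, AnnoyIndex) are available;
# the node names and 64-dimensional random embeddings are made up for
# illustration.
import numpy as np

rng = np.random.default_rng(0)
sample_ids = ["node_%d" % i for i in range(50)]
sample_embeddings = rng.normal(size=(50, 64))

neighbors = get_nearest_neighbors(sample_ids, sample_embeddings, num_nearest=5)
for node_id, ranked in list(neighbors.items())[:3]:
    # each entry is a list of (neighbor_id, rank) pairs
    print(node_id, ranked)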
        path=model_path, is_train=False, hyper_overrides={})

predictions = []
for language in ('python', 'go', 'javascript', 'java', 'php', 'ruby'):
    print("Evaluating language: %s" % language)
    definitions = pickle.load(
        open('../resources/data/{}_dedupe_definitions_v2.pkl'.format(language), 'rb'))
    indexes = [{'code_tokens': d['function_tokens'], 'language': d['language']}
               for d in tqdm(definitions)]
    code_representations = model.get_code_representations(indexes)

    indices = AnnoyIndex(code_representations[0].shape[0], 'angular')
    for index, vector in tqdm(enumerate(code_representations)):
        if vector is not None:
            indices.add_item(index, vector)
    indices.build(10)

    for query in queries:
        for idx, _ in zip(*query_model(query, model, indices, language)):
            predictions.append(
                (query, language, definitions[idx]['identifier'], definitions[idx]['url']))

df = pd.DataFrame(predictions, columns=['query', 'language', 'identifier', 'url'])
df.to_csv(predictions_csv, index=False)

if run_id:
    print('Uploading predictions to W&B')
    # upload model predictions CSV file to W&B
    # we checked that there are three path components above
    entity, project, name = args_wandb_run_id.split('/')
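# A hedged sketch of what a query_model helper along these lines might do:
# encode the natural-language query with the same model, then look it up in
# the per-language Annoy index. The method name get_query_representations and
# the input format are assumptions for illustration, not the actual API used
# above.
def query_model_sketch(query, model, indices, language, topk=100):
    query_vector = model.get_query_representations(  # assumed encoder method
        [{'docstring_tokens': query.split(), 'language': language}])[0]
    idxs, distances = indices.get_nns_by_vector(
        query_vector, topk, include_distances=True)
    return idxs, distances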