@classmethod
def setUpClass(cls):
    # build a small random index once for the whole test class
    index = HnswIndex(cls.dim)
    for i in range(cls.data_num):
        v = [random.gauss(0, 1) for z in range(cls.dim)]
        index.add_data(v)
    index.build(n_threads=12)
    index.save(cls.model_fname)
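For symmetry, a minimal sketch of a test that consumes the index saved above, assuming the same class attributes (dim, model_fname) and the search_by_id call shown in the usage example later in this section:

def test00_search_by_id(self):
    index = HnswIndex(self.dim)
    index.load(self.model_fname)
    # an indexed point should appear among its own nearest neighbors
    neighbors = index.search_by_id(0, 3)
    self.assertIn(0, neighbors)
    index.unload()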
def test01_most_similar(self):
    set_log_level(1)
    model = self.load_text8_model()
    index = HnswIndex(model.L0.shape[1])
    model.normalize('item')
    for f in model.L0:
        index.add_data(f)
    index.build(n_threads=4)
    index.save('n2.bin')

    par = ParW2V(model)
    model.opt.num_workers = 1
    all_keys = model._idmanager.itemids[:10000]

    # naive sequential most_similar, one query at a time
    start_t = time.time()
    for k in all_keys:
        model.most_similar(k, topk=10)
    naive_elapsed = time.time() - start_t

    # parallel most_similar over all queries at once
    par.num_workers = 4
    start_t = time.time()
    par.most_similar(all_keys, topk=10, repr=True)
    par_elapsed = time.time() - start_t

    # parallel most_similar backed by the HNSW index
    start_t = time.time()
    par.set_hnsw_index('n2.bin', 'item')
    par.most_similar(all_keys, topk=10, repr=True)
    ann_elapsed = time.time() - start_t

    self.assertTrue(naive_elapsed > par_elapsed * 1.5 > ann_elapsed * 5.0,
                    msg=f'{naive_elapsed} > {par_elapsed} > {ann_elapsed}')
    index.unload()
    os.remove('n2.bin')
def example2():
    log.set_log_level(log.INFO)
    als_option = ALSOption().get_default_option()
    data_option = MatrixMarketOptions().get_default_option()
    data_option.input.main = '../tests/ext/ml-20m/main'
    data_option.input.iid = '../tests/ext/ml-20m/iid'
    data_option.data.path = './ml20m.h5py'
    data_option.data.use_cache = True

    als = ALS(als_option, data_opt=data_option)
    als.initialize()
    als.train()
    als.normalize('item')
    als.build_itemid_map()

    print('Make item recommendations on als.ml20m.par.top10.tsv with Parallel (Thread=4)')
    par = ParALS(als)
    par.num_workers = 4
    all_items = als._idmanager.itemids
    start_t = time.time()
    with open('als.ml20m.par.top10.tsv', 'w') as fout:
        for idx in range(0, len(all_items), 128):
            topks, _ = par.most_similar(all_items[idx:idx + 128], repr=True)
            for q, p in zip(all_items[idx:idx + 128], topks):
                fout.write('%s\t%s\n' % (q, '\t'.join(p)))
    print('took: %.3f secs' % (time.time() - start_t))

    from n2 import HnswIndex
    index = HnswIndex(als.Q.shape[1])
    for f in als.Q:
        index.add_data(f)
    index.build(n_threads=4)
    index.save('ml20m.n2.index')
    index.unload()

    print('Make item recommendations on als.ml20m.ann.top10.tsv with ANN (Thread=4)')
    par.set_hnsw_index('ml20m.n2.index', 'item')
    par.num_workers = 4
    start_t = time.time()
    with open('als.ml20m.ann.top10.tsv', 'w') as fout:
        for idx in range(0, len(all_items), 128):
            topks, _ = par.most_similar(all_items[idx:idx + 128], repr=True)
            for q, p in zip(all_items[idx:idx + 128], topks):
                fout.write('%s\t%s\n' % (q, '\t'.join(p)))
    print('took: %.3f secs' % (time.time() - start_t))
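As a follow-up (not part of the original example), the two output files can be compared to estimate how much accuracy the ANN path gives up; a minimal sketch, assuming each row is a query id followed by its tab-separated top-10 as written above:

def topk_map(fname):
    # parse "query \t neighbor1 \t neighbor2 ..." rows into {query: set(neighbors)}
    with open(fname) as fin:
        rows = (line.rstrip('\n').split('\t') for line in fin)
        return {cols[0]: set(cols[1:]) for cols in rows}

exact = topk_map('als.ml20m.par.top10.tsv')
approx = topk_map('als.ml20m.ann.top10.tsv')
overlap = sum(len(exact[q] & approx.get(q, set())) for q in exact)
print('ANN top-10 recall vs. exact search: %.4f' % (overlap / (10.0 * len(exact))))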
class N2(BaseANN):
    def __init__(self, m, ef_construction, n_threads, ef_search, metric):
        self._m = m
        self._m0 = m * 2
        self._ef_construction = ef_construction
        self._n_threads = n_threads
        self._ef_search = ef_search
        self._index_name = os.path.join(
            INDEX_DIR,
            "youtube_n2_M%d_efCon%d_n_thread%s" % (m, ef_construction, n_threads))
        self.name = "N2_M%d_efCon%d_n_thread%s_efSearch%d" % (
            m, ef_construction, n_threads, ef_search)
        self._metric = metric
        d = os.path.dirname(self._index_name)
        if not os.path.exists(d):
            os.makedirs(d)

    def fit(self, X):
        from n2 import HnswIndex
        if self._metric == 'euclidean':
            self._n2 = HnswIndex(X.shape[1], 'L2')
        else:
            self._n2 = HnswIndex(X.shape[1])
        if os.path.exists(self._index_name):
            logging.debug("Loading index from file")
            self._n2.load(self._index_name)
        else:
            logging.debug("Index file does not exist: {0}".format(self._index_name))
            logging.debug("Start fitting")
            for x in X:
                self._n2.add_data(x.tolist())
            self._n2.build(m=self._m, max_m0=self._m0,
                           ef_construction=self._ef_construction,
                           n_threads=self._n_threads)
            self._n2.save(self._index_name)

    def query(self, v, n):
        return self._n2.search_by_vector(v.tolist(), n, self._ef_search)

    def __str__(self):
        return self.name
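A quick smoke test of this wrapper might look like the following; it assumes the class above is importable together with its BaseANN and INDEX_DIR dependencies, and the hyperparameter values are illustrative, not tuned:

import numpy as np

# illustrative hyperparameters; fit() persists the index under INDEX_DIR on first use
ann = N2(m=12, ef_construction=100, n_threads=4, ef_search=50, metric='angular')
X = np.random.normal(size=(1000, 40)).astype(np.float32)
ann.fit(X)
print(ann.query(X[0], 10))  # ids of the 10 approximate nearest neighbors of the first point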
class N2(BaseANN):
    def __init__(self, m, ef_construction, n_threads, ef_search, metric, batch):
        self.name = "N2_M%d_efCon%d_n_thread%s_efSearch%d%s" % (
            m, ef_construction, n_threads, ef_search, '_batch' if batch else '')
        self._m = m
        self._m0 = m * 2
        self._ef_construction = ef_construction
        self._n_threads = n_threads
        self._ef_search = ef_search
        self._index_name = os.path.join(
            CACHE_DIR,
            "index_n2_%s_M%d_efCon%d_n_thread%s" % (args.dataset, m, ef_construction, n_threads))
        self._metric = metric

    def fit(self, X):
        if self._metric == 'euclidean':
            self._n2 = HnswIndex(X.shape[1], 'L2')
        elif self._metric == 'dot':
            self._n2 = HnswIndex(X.shape[1], 'dot')
        else:
            self._n2 = HnswIndex(X.shape[1])

        if os.path.exists(self._index_name):
            n2_logger.info("Loading index from file")
            self._n2.load(self._index_name, use_mmap=False)
            return

        n2_logger.info("Create Index")
        for x in X:
            self._n2.add_data(x)
        self._n2.build(m=self._m, max_m0=self._m0,
                       ef_construction=self._ef_construction,
                       n_threads=self._n_threads)
        self._n2.save(self._index_name)

    def query(self, v, n):
        return self._n2.search_by_vector(v, n, self._ef_search)

    def batch_query(self, X, n):
        self.b_res = self._n2.batch_search_by_vectors(X, n, self._ef_search, self._n_threads)

    def get_batch_results(self):
        return self.b_res

    def __str__(self):
        return self.name
from n2 import HnswIndex
import random

f = 3
t = HnswIndex(f)  # HnswIndex(f, "L2 or angular")
for i in range(1000):
    v = [random.gauss(0, 1) for z in range(f)]
    t.add_data(v)
t.build(m=5, max_m0=10, n_threads=4)
t.save('test.n2')

u = HnswIndex(f, "angular")
u.load('test.n2')

search_id = 1
k = 3
neighbor_ids = u.search_by_id(search_id, k)
print("[search_by_id]: Nearest neighbors of id {}: {}".format(search_id, neighbor_ids))

example_vector_query = [random.gauss(0, 1) for z in range(f)]
nns = u.search_by_vector(example_vector_query, k, include_distances=True)
print("[search_by_vector]: Nearest neighbors of vector {}: {}".format(example_vector_query, nns))
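n2 also exposes the batch interface used by the benchmark wrapper above; a short sketch continuing from the same example, where ef_search=50 and 4 threads are illustrative values:

queries = [[random.gauss(0, 1) for z in range(f)] for _ in range(100)]
# arguments follow the (vectors, k, ef_search, n_threads) order used in batch_query above
batch_results = u.batch_search_by_vectors(queries, k, 50, 4)
print("[batch_search_by_vectors]: neighbors of the first query: {}".format(batch_results[0]))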
def _buffalo(algo_name, database):
    repeat = 3
    options = {
        'als': {'num_workers': 4, 'compute_loss_on_training': False,
                'd': 32, 'num_iters': 10},
        'bpr': {'num_workers': 4, 'compute_loss_on_training': False,
                'd': 32, 'num_iters': 100},
    }
    opt = options[algo_name]

    # linear (exact) model
    if algo_name == 'als':
        PAR = ParALS
        model = BuffaloLib().als(database, return_instance_before_train=True, **opt)
    elif algo_name == 'bpr':
        PAR = ParBPRMF
        model = BuffaloLib().bpr(database, return_instance_before_train=True, **opt)
    model.train()
    model.build_itemid_map()
    model.normalize('item')

    # parallel
    par = PAR(model)

    # ann
    index = HnswIndex(model.P.shape[1])
    for f in model.P:
        index.add_data(f)
    index.build(n_threads=4)
    index.save('bm_n2.bin')
    ann = PAR(model)
    ann.set_hnsw_index('bm_n2.bin', 'item')

    total_queries = 10000
    keys = model._idmanager.itemids[:total_queries]
    print('Total queries: %s' % len(keys))
    results = {}
    nn_opts = {'topk': 10}
    for p, m in [('S', model), ('P', par), ('A', ann)]:
        results[p] = {}
        opt = nn_opts.copy()
        if not isinstance(m, PAR):
            opt['iterable'] = keys
        for num_workers in [1, 2, 4]:
            if isinstance(m, PAR):
                m.num_workers = num_workers
            else:
                m.opt.num_workers = num_workers
            opt['model'] = m
            elapsed, memory_usage = _get_elapsed_time('most_similar', keys,
                                                      BuffaloLib(), repeat, **opt)
            s = elapsed / len(keys)
            results[p][f'S={num_workers}'] = s
            results[p][f'E={num_workers}'] = elapsed
            results[p][f'M={num_workers}'] = memory_usage['max']
            results[p][f'A={num_workers}'] = memory_usage['avg']
            results[p][f'B={num_workers}'] = memory_usage['min']
            print(f'{p}M={num_workers} {elapsed} {memory_usage}')
    return results
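The per-setting keys written into results above (S = per-query seconds, E = total elapsed, M/A/B = max/avg/min memory) can be dumped as a small report; a hypothetical helper, sketched here only to document that dict layout:

def print_results(results):
    # 'S'/'P'/'A' rows correspond to sequential, parallel, and ANN-backed search
    for p, metrics in sorted(results.items()):
        for w in [1, 2, 4]:
            print('%s workers=%d per_query=%.6fs elapsed=%.3fs mem(max)=%s' % (
                p, w, metrics['S=%d' % w], metrics['E=%d' % w], metrics['M=%d' % w]))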