def test01_most_similar(self):
    """Compare naive, parallel and HNSW-backed ``most_similar`` timings.

    An HNSW index is built from the rows of ``model.L0`` and saved to
    ``n2.bin``.  The same 10000 item keys are then queried three ways:

    1. one ``model.most_similar`` call per key (``num_workers = 1``),
    2. a single batched ``ParW2V.most_similar`` call with 4 workers,
    3. the same batched call after ``set_hnsw_index`` has been applied.

    The test passes when
    ``naive_elapsed > par_elapsed * 1.5 > ann_elapsed * 5.0``.
    """
    set_log_level(1)
    model = self.load_text8_model()
    index = HnswIndex(model.L0.shape[1])
    model.normalize('item')
    for f in model.L0:
        index.add_data(f)
    index.build(n_threads=4)
    index.save('n2.bin')

    # From here on 'n2.bin' exists on disk; the finally clause guarantees
    # it is removed (and the index unloaded) even when the timing
    # assertion fails or a query raises.
    try:
        par = ParW2V(model)

        model.opt.num_workers = 1
        all_keys = model._idmanager.itemids[::][:10000]

        # Baseline: one query per key through the model itself.
        start_t = time.time()
        for k in all_keys:
            model.most_similar(k, topk=10)
        naive_elapsed = time.time() - start_t

        # Batched query through the parallel wrapper.
        par.num_workers = 4
        start_t = time.time()
        par.most_similar(all_keys, topk=10, repr=True)
        par_elapsed = time.time() - start_t

        # Batched query backed by the saved index.  The timer is started
        # before set_hnsw_index, so that call is part of the measurement.
        start_t = time.time()
        par.set_hnsw_index('n2.bin', 'item')
        par.most_similar(all_keys, topk=10, repr=True)
        ann_elapsed = time.time() - start_t

        self.assertTrue(
            naive_elapsed > par_elapsed * 1.5 > ann_elapsed * 5.0,
            msg=f'{naive_elapsed} > {par_elapsed} > {ann_elapsed}')
    finally:
        index.unload()
        os.remove('n2.bin')
def _write_most_similar_tsv(par, items, path, batch_size=128):
    """Write ``par.most_similar`` results for ``items`` to ``path``.

    ``items`` is queried in consecutive slices of ``batch_size``.  For
    every queried item one line is written: the item, a tab, and the
    tab-joined entries of its result.

    Args:
        par: object exposing ``most_similar(keys, repr=True)`` that returns
            a pair whose first element holds one result per key.
        items: sliceable sequence of item keys.
        path: output file path; opened in ``'w'`` mode.
        batch_size: number of keys sent per ``most_similar`` call.
    """
    with open(path, 'w') as fout:
        for begin in range(0, len(items), batch_size):
            batch = items[begin:begin + batch_size]
            topks, _ = par.most_similar(batch, repr=True)
            for query, similar in zip(batch, topks):
                fout.write('%s\t%s\n' % (query, '\t'.join(similar)))


def example2():
    """Train ALS on the ml-20m files and dump item-to-item results twice.

    Steps performed:

    1. Build default ALS / MatrixMarket options, point them at the
       ``../tests/ext/ml-20m`` inputs, then initialize and train the model.
    2. Normalize the item factors and build the item-id map.
    3. Write ``als.ml20m.par.top10.tsv`` using ``ParALS`` with 4 workers.
    4. Build an ``n2`` HNSW index from ``als.Q``, save it as
       ``ml20m.n2.index``, attach it to the same ``ParALS`` object and
       write ``als.ml20m.ann.top10.tsv``.

    The elapsed time of each dump is printed.
    """
    log.set_log_level(log.INFO)
    als_option = ALSOption().get_default_option()
    data_option = MatrixMarketOptions().get_default_option()
    data_option.input.main = '../tests/ext/ml-20m/main'
    data_option.input.iid = '../tests/ext/ml-20m/iid'
    data_option.data.path = './ml20m.h5py'
    data_option.data.use_cache = True

    als = ALS(als_option, data_opt=data_option)
    als.initialize()
    als.train()
    als.normalize('item')
    als.build_itemid_map()

    print(
        'Make item recommendation on als.ml20m.par.top10.tsv with Paralell(Thread=4)'
    )
    par = ParALS(als)
    par.num_workers = 4
    all_items = als._idmanager.itemids
    start_t = time.time()
    _write_most_similar_tsv(par, all_items, 'als.ml20m.par.top10.tsv')
    print('took: %.3f secs' % (time.time() - start_t))

    # Imported here, as in the original, so n2 is only required when this
    # example actually runs.
    from n2 import HnswIndex
    index = HnswIndex(als.Q.shape[1])
    for f in als.Q:
        index.add_data(f)
    index.build(n_threads=4)
    index.save('ml20m.n2.index')
    index.unload()

    # The message now names the file and worker count this phase really
    # uses; it previously repeated the "par" file name and said Thread=1.
    print(
        'Make item recommendation on als.ml20m.ann.top10.tsv with Ann(Thread=4)'
    )
    par.set_hnsw_index('ml20m.n2.index', 'item')
    par.num_workers = 4
    start_t = time.time()
    _write_most_similar_tsv(par, all_items, 'als.ml20m.ann.top10.tsv')
    print('took: %.3f secs' % (time.time() - start_t))