def _test8_serialization(self, cls, opt):
    """Check that similarity results still hold after a save/load round trip.

    Args:
        cls: Algorithm class under test; built as ``cls(opt, data_opt=...)``.
        opt: Algorithm option object, passed to ``cls`` unchanged.
    """
    set_log_level(1)
    query = '180.Return_of_the_Jedi_(1983)'
    expected = '49.Star_Wars_(1977)'
    model_path = 'model.bin'

    data_opt = MatrixMarketOptions().get_default_option()
    # Every input path is ``self.ml_100k`` followed by the field's own name.
    for field in ('main', 'uid', 'iid'):
        setattr(data_opt.input, field, self.ml_100k + field)
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    model = cls(opt, data_opt=data_opt)
    model.initialize()
    model.train()

    def similar_to_query():
        return [name for name, _ in model.most_similar(query, topk=100)]

    self.assertIn(expected, similar_to_query())

    # Round trip through disk, then make sure the same neighbour is found.
    model.save(model_path)
    model.load(model_path)
    os.remove(model_path)
    self.assertIn(expected, similar_to_query())
def _test9_compact_serialization(self, cls, opt):
    """Check that a model saved without the user-id map can be partially reloaded.

    The model is trained, saved with ``with_userid_map=False``, and reloaded
    into a fresh ``cls(opt)`` instance restricted to the ``Q`` and
    ``_idmanager`` fields.  Similarity queries must keep working, before and
    after item normalization, and the reloaded instance must not have ``P``.

    Args:
        cls: Algorithm class under test; built as ``cls(opt, data_opt=...)``.
        opt: Algorithm option object, passed to ``cls`` unchanged.
    """
    import os  # local import so the cleanup below does not rely on file-level imports

    set_log_level(1)
    data_opt = MatrixMarketOptions().get_default_option()
    data_opt.input.main = self.ml_100k + 'main'
    data_opt.input.uid = self.ml_100k + 'uid'
    data_opt.input.iid = self.ml_100k + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    c = cls(opt, data_opt=data_opt)
    c.initialize()
    c.train()
    ret_a = [
        x for x, _ in c.most_similar('180.Return_of_the_Jedi_(1983)', topk=100)
    ]
    self.assertIn('49.Star_Wars_(1977)', ret_a)

    c.save('model.bin', with_userid_map=False)
    try:
        c = cls(opt)
        c.load('model.bin', data_fields=['Q', '_idmanager'])
    finally:
        # The original left model.bin behind; remove it even if load fails.
        os.remove('model.bin')

    ret_a = [
        x for x, _ in c.most_similar('180.Return_of_the_Jedi_(1983)', topk=100)
    ]
    self.assertIn('49.Star_Wars_(1977)', ret_a)
    self.assertFalse(hasattr(c, 'P'))

    c.normalize(group='item')
    ret_a = [
        x for x, _ in c.most_similar('180.Return_of_the_Jedi_(1983)', topk=100)
    ]
    self.assertIn('49.Star_Wars_(1977)', ret_a)
def test10_fast_most_similar(self):
    """Check that CFR ``most_similar`` is faster after item normalization.

    The same batch of queries is timed before and after
    ``normalize(group='item')``; the first run must take longer.
    """
    set_log_level(1)
    opt = CFROption().get_default_option()

    data_opt = StreamOptions().get_default_option()
    data_opt.data.sppmi = {"windows": 5, "k": 10}
    data_opt.data.internal_data_type = "matrix"
    for field, suffix in (('main', 'stream'), ('uid', 'uid'), ('iid', 'iid')):
        setattr(data_opt.input, field, self.ml_100k + suffix)
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    model = CFR(opt, data_opt=data_opt)
    model.initialize()
    model.train()
    keys = [name for name, _ in model.most_similar('49.Star_Wars_(1977)', 10)]

    def time_queries():
        # 100 passes over the same keys, measured with wall-clock time.
        began = time.time()
        for _ in range(100):
            for key in keys:
                model.most_similar(key)
        return time.time() - began

    elapsed_a = time_queries()
    model.normalize(group='item')
    elapsed_b = time_queries()
    self.assertTrue(elapsed_a > elapsed_b)
def _test6_topk(self, cls, opt):
    """Check top-k recommendation size and that normalization keeps the top neighbours.

    Args:
        cls: Algorithm class under test; built as ``cls(opt, data_opt=...)``.
        opt: Algorithm option object, passed to ``cls`` unchanged.
    """
    set_log_level(2)
    data_opt = MatrixMarketOptions().get_default_option()
    data_opt.input.main = self.ml_100k + 'main'
    data_opt.input.uid = self.ml_100k + 'uid'
    data_opt.input.iid = self.ml_100k + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    c = cls(opt, data_opt=data_opt)
    c.initialize()
    c.train()

    # assertTrue(x, 10) would treat 10 as the failure message and never
    # compare the length; assertEqual performs the intended check.
    self.assertEqual(len(c.topk_recommendation('1', 10)), 10)

    ret_a = [
        x for x, _ in c.most_similar('180.Return_of_the_Jedi_(1983)', topk=100)
    ]
    self.assertIn('49.Star_Wars_(1977)', ret_a)

    c.normalize()
    ret_b = [
        x for x, _ in c.most_similar('180.Return_of_the_Jedi_(1983)', topk=100)
    ]
    self.assertIn('49.Star_Wars_(1977)', ret_b)
    # Only the ten leading neighbours are required to match after normalize().
    self.assertEqual(ret_a[:10], ret_b[:10])
def test08_serialization(self):
    """Run the shared serialization check against BPRMF with a small model."""
    opt = BPRMFOption().get_default_option()
    overrides = (
        ('num_iters', 200),
        ('d', 5),
        ('validation', aux.Option({'topk': 10})),
    )
    for key, value in overrides:
        setattr(opt, key, value)
    self._test8_serialization(BPRMF, opt)
def get_default_option(self) -> aux.Option:
    """Return the default option tree for ``'stream'`` type data."""
    input_section = {
        'main': '',
        'uid': '',  # when empty, the row id is used as the user id
        'iid': '',  # when empty, the column id is used as the item id
    }
    validation_section = {
        'name': 'newest',  # one of: sample, oldest, newest
        'p': 0.01,  # ignored when name is oldest or newest
        'n': 1,  # ignored when name is sample
        'max_samples': 500,
    }
    data_section = {
        'validation': validation_section,
        # Empty by default; example keys: 'windows': 5, 'k': 1
        'sppmi': {},
        'batch_mb': 1024,
        'use_cache': False,
        'tmp_dir': '/tmp/',
        'path': './stream.h5py',
        # 'matrix' stores the internal data in a matrix-market-like layout
        'internal_data_type': 'stream',
    }
    return aux.Option({
        'type': 'stream',
        'input': input_section,
        'data': data_section,
    })
def _test10_fast_most_similar(self, cls, opt):
    """Check that ``most_similar`` is faster after item normalization.

    Args:
        cls: Algorithm class under test; built as ``cls(opt, data_opt=...)``.
        opt: Algorithm option object, passed to ``cls`` unchanged.
    """
    set_log_level(1)
    data_opt = MatrixMarketOptions().get_default_option()
    for field in ('main', 'uid', 'iid'):
        setattr(data_opt.input, field, self.ml_100k + field)
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    model = cls(opt, data_opt=data_opt)
    model.initialize()
    model.train()
    keys = [
        name for name, _ in model.most_similar('49.Star_Wars_(1977)', topk=100)
    ]

    def time_queries():
        # 100 passes over the same keys, measured with wall-clock time.
        began = time.time()
        for _ in range(100):
            for key in keys:
                model.most_similar(key)
        return time.time() - began

    elapsed_a = time_queries()
    model.normalize(group='item')
    elapsed_b = time_queries()
    self.assertTrue(elapsed_a > elapsed_b)
def test09_compact_serialization(self):
    """Check that a CFR model saved without the user-id map can be partially reloaded.

    The model is saved with ``with_userid_map=False`` and reloaded into a
    fresh ``CFR(opt)`` restricted to the ``I`` and ``_idmanager`` fields.
    Similarity queries must keep working, before and after item
    normalization, and the reloaded instance must not have ``U``.
    """
    import os  # local import so the cleanup below does not rely on file-level imports

    set_log_level(1)
    opt = CFROption().get_default_option()
    data_opt = StreamOptions().get_default_option()
    data_opt.data.sppmi = {"windows": 5, "k": 10}
    data_opt.data.internal_data_type = "matrix"
    data_opt.input.main = self.ml_100k + 'stream'
    data_opt.input.uid = self.ml_100k + 'uid'
    data_opt.input.iid = self.ml_100k + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    c = CFR(opt, data_opt=data_opt)
    c.initialize()
    c.train()
    ret_a = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
    self.assertIn('180.Return_of_the_Jedi_(1983)', ret_a)

    c.save('model.bin', with_userid_map=False)
    try:
        c = CFR(opt)
        c.load('model.bin', data_fields=['I', '_idmanager'])
    finally:
        # The original left model.bin behind; remove it even if load fails.
        os.remove('model.bin')

    ret_a = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
    self.assertIn('180.Return_of_the_Jedi_(1983)', ret_a)
    self.assertFalse(hasattr(c, 'U'))

    c.normalize(group='item')
    ret_a = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
    self.assertIn('180.Return_of_the_Jedi_(1983)', ret_a)
def test11_gpu_validation(self):
    """Run the shared validation check for BPRMF with ``accelerator`` enabled.

    The test is reported as skipped, rather than silently passing, when
    ``inited_CUBPR`` is falsy.
    """
    if not inited_CUBPR:
        self.skipTest('inited_CUBPR is false; accelerator test not run')
    np.random.seed(7)
    opt = BPRMFOption().get_default_option()
    opt.d = 100
    opt.verify_neg = False
    opt.accelerator = True
    opt.lr = 0.01
    opt.reg_b = 10.0
    opt.num_iters = 500
    opt.evaluation_period = 50
    opt.random_seed = 777
    opt.validation = aux.Option({'topk': 10})
    opt.tensorboard = aux.Option({'root': './tb', 'name': 'bpr'})
    self._test5_validation(BPRMF, opt, ndcg=0.03, map=0.02)
def __init__(self, *args, **kwargs):
    """Create an empty ``_idmanager``: id lists, id maps and 'mapped' flags."""
    initial_state = dict(
        userid=[],
        userid_map={},
        itemid=[],
        itemid_map={},
        userid_mapped=False,
        itemid_mapped=False,
    )
    self._idmanager = aux.Option(initial_state)
def test11_train_ml_20m_on_gpu(self):
    """Run the shared ml-20m training check for ALS with ``accelerator`` enabled."""
    opt = ALSOption().get_default_option()
    settings = (
        ('num_workers', 8),
        ('d', 100),
        ('validation', aux.Option({'topk': 10})),
        ('compute_loss_on_training', True),
        ('accelerator', True),
        ('num_cg_max_iters', 3),
    )
    for key, value in settings:
        setattr(opt, key, value)
    self._test7_train_ml_20m(ALS, opt)
def run(self, opt_path):
    """Train an ``_ALS`` model from ``opt_path`` and save it when configured.

    Args:
        opt_path: Passed both to ``aux.Option`` and to ``_ALS``.
    """
    opt = aux.Option(opt_path)
    model = _ALS(opt_path)
    model.init_factors()
    final_loss = model.train()
    self.logger.info(f'ALS finished with loss({final_loss}).')

    # Nothing more to do unless the options ask for the factors to be saved.
    if not opt.save_factors:
        return
    self.logger.info(f'Saving model to {opt.model_path}.')
    model.save(opt.model_path)
def test4_optimize(self):
    """Check that ALS hyper-parameter optimization beats the default run.

    The best ``val_rmse`` found by ``optimize()`` must be lower than the
    default run's ``rmse``, and the model reloaded from ``als.bin`` must
    report that same best value.
    """
    set_log_level(2)

    search_space = {
        'd': ['randint', ['d', 10, 20]],
        'reg_u': ['uniform', ['reg_u', 0.1, 0.3]],
        'reg_i': ['uniform', ['reg_i', 0.1, 0.3]],
        'alpha': ['randint', ['alpha', 8, 10]],
    }

    opt = ALSOption().get_default_option()
    opt.d = 5
    opt.num_workers = 2
    opt.model_path = 'als.bin'
    opt.validation = aux.Option({'topk': 10})
    opt.optimize = aux.Option({
        'loss': 'val_rmse',
        'max_trials': 10,
        'deployment': True,
        'start_with_default_parameters': True,
        'space': search_space,
    })
    opt.evaluation_period = 1
    opt.tensorboard = aux.Option({'root': './tb', 'name': 'als'})

    data_opt = MatrixMarketOptions().get_default_option()
    for field in ('main', 'uid', 'iid'):
        setattr(data_opt.input, field, self.ml_100k + field)
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    als = ALS(opt, data_opt=data_opt)
    als.init_factors()
    als.train()
    default_result = als.get_validation_results()
    als.optimize()

    # Read the baseline only after optimize(), exactly as the test always did.
    baseline_rmse = default_result['rmse']  # val_rmse
    best_rmse = als.get_optimization_data()['best']['val_rmse']
    self.assertTrue(baseline_rmse > best_rmse)

    als.load('als.bin')
    reloaded = als.get_validation_results()
    self.assertAlmostEqual(reloaded['rmse'], best_rmse)
    os.remove('als.bin')
def test00_tensorboard(self):
    """Train ALS with the tensorboard option set and check validation metrics."""
    set_log_level(2)

    opt = ALSOption().get_default_option()
    opt.d = 5
    opt.validation = aux.Option({'topk': 10})
    opt.tensorboard = aux.Option({'root': './tb', 'name': 'als'})

    data_opt = MatrixMarketOptions().get_default_option()
    for field in ('main', 'uid', 'iid'):
        setattr(data_opt.input, field, self.ml_100k + field)
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    als = ALS(opt, data_opt=data_opt)
    als.initialize()
    als.train()

    results = als.get_validation_results()
    # Lower bounds the trained model must exceed.
    for metric, floor in (('ndcg', 0.025), ('map', 0.015)):
        self.assertTrue(results[metric] > floor)
def test5_validation(self, ndcg=0.06, map=0.04):
    """Train CFR on stream data and check validation metrics exceed the floors.

    Args:
        ndcg: Value the resulting ``ndcg`` must be greater than.
        map: Value the resulting ``map`` must be greater than.
    """
    set_log_level(3)

    opt = CFROption().get_default_option()
    opt.validation = aux.Option({'topk': 10})
    opt.tensorboard = aux.Option({'root': './tb', 'name': 'cfr'})

    data_opt = StreamOptions().get_default_option()
    data_opt.data.validation.name = 'sample'
    data_opt.data.sppmi = {'windows': 5, 'k': 10}
    data_opt.data.internal_data_type = 'matrix'
    for field, suffix in (('main', 'stream'), ('uid', 'uid'), ('iid', 'iid')):
        setattr(data_opt.input, field, self.ml_100k + suffix)
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    model = CFR(opt, data_opt=data_opt)
    model.initialize()
    model.train()

    results = model.get_validation_results()
    self.assertTrue(results['ndcg'] > ndcg)
    self.assertTrue(results['map'] > map)
def test12_gpu_train_ml_20m(self):
    """Run the shared ml-20m training check for BPRMF with ``accelerator`` enabled.

    The test is reported as skipped, rather than silently passing, when
    ``inited_CUBPR`` is falsy.
    """
    if not inited_CUBPR:
        self.skipTest('inited_CUBPR is false; accelerator test not run')
    opt = BPRMFOption().get_default_option()
    opt.accelerator = True
    opt.d = 100
    opt.verify_neg = False
    opt.num_iters = 30
    opt.evaluation_period = 5
    opt.validation = aux.Option({'topk': 10})
    self._test7_train_ml_20m(BPRMF, opt)
def _get_initial_tensorboard_data(self):
    """Return a fresh tensorboard state: handles unset, empty dicts, ``step`` at 1."""
    initial_state = dict(
        summary_writer=None,
        name=None,
        metrics={},
        feed_dict={},
        merged_summary_op=None,
        session=None,
        pbar=None,
        data_root=None,
        step=1,
    )
    return aux.Option(initial_state)
def _test4_train(self, cls, opt):
    """Smoke test: initialize and train ``cls`` without raising.

    Args:
        cls: Algorithm class under test; built as ``cls(opt, data_opt=...)``.
        opt: Algorithm option object, passed to ``cls`` unchanged.
    """
    set_log_level(3)
    data_opt = MatrixMarketOptions().get_default_option()
    for field in ('main', 'uid', 'iid'):
        setattr(data_opt.input, field, self.ml_100k + field)
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    model = cls(opt, data_opt=data_opt)
    model.initialize()
    model.train()
    # Reaching this line without an exception is the whole check.
    self.assertTrue(True)
def __init__(self, opt, *args, **kwargs):
    """Store the options, ensure the temp directory exists, and pick the value preprocessor.

    Args:
        opt: Data options; wrapped in ``aux.Option`` and stored as ``self.opt``.
    """
    self.opt = aux.Option(opt)
    # Read from the wrapped options: a plain dict argument has no ``.data``.
    self.tmp_root = self.opt.data.tmp_dir
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(self.tmp_root, exist_ok=True)
    self.handle = None
    self.header = None
    # Default preprocessor; replaced below when ``value_prepro`` is configured.
    self.prepro = prepro.PreProcess(self.opt.data)
    self.value_prepro = self.prepro
    if self.opt.data.value_prepro:
        # Look up the preprocessor class in ``prepro`` by its configured name.
        self.prepro = getattr(prepro, self.opt.data.value_prepro.name)(self.opt.data.value_prepro)
        self.value_prepro = self.prepro
    self.data_type = None
def test6_topk(self):
    """Check CFR top-k recommendation size and that normalization keeps the neighbours."""
    set_log_level(1)
    opt = CFROption().get_default_option()
    opt.validation = aux.Option({'topk': 10})
    data_opt = StreamOptions().get_default_option()
    data_opt.data.validation.name = "sample"
    data_opt.data.sppmi = {"windows": 5, "k": 10}
    data_opt.data.internal_data_type = "matrix"
    data_opt.input.main = self.ml_100k + 'stream'
    data_opt.input.uid = self.ml_100k + 'uid'
    data_opt.input.iid = self.ml_100k + 'iid'
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    c = CFR(opt, data_opt=data_opt)
    c.initialize()
    c.train()

    # assertTrue(x, 10) would treat 10 as the failure message and never
    # compare the length; assertEqual performs the intended check.
    self.assertEqual(len(c.topk_recommendation('1', 10)), 10)

    ret_a = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
    self.assertIn('180.Return_of_the_Jedi_(1983)', ret_a)

    c.normalize()
    ret_b = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
    self.assertIn('180.Return_of_the_Jedi_(1983)', ret_b)
    self.assertEqual(ret_a, ret_b)
def test4_train(self):
    """Smoke test: initialize and train CFR on stream data without raising."""
    set_log_level(3)
    opt = CFROption().get_default_option()

    data_opt = StreamOptions().get_default_option()
    data_opt.data.sppmi = {'windows': 5, 'k': 10}
    data_opt.data.internal_data_type = 'matrix'
    for field, suffix in (('main', 'stream'), ('uid', 'uid'), ('iid', 'iid')):
        setattr(data_opt.input, field, self.ml_100k + suffix)
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    model = CFR(opt, data_opt=data_opt)
    model.initialize()
    model.train()
    # Reaching this line without an exception is the whole check.
    self.assertTrue(True)
def load(opt):
    """Build the data object selected by ``opt['type']``.

    Args:
        opt: A ``str`` (first turned into ``aux.Option``), or a
            ``dict``/``aux.Option`` holding a ``'type'`` entry.

    Returns:
        ``MatrixMarket(opt)`` for ``'matrix_market'``, ``Stream(opt)`` for
        ``'stream'``.

    Raises:
        AssertionError: If ``opt`` is neither str nor dict/aux.Option.
        RuntimeError: If ``opt['type']`` is any other value.
    """
    from buffalo.data.mm import MatrixMarket
    from buffalo.data.stream import Stream

    if isinstance(opt, str):
        opt = aux.Option(opt)
    is_mapping = isinstance(opt, (dict, aux.Option))
    assert is_mapping, \
        'opt must be either str, or dict/aux.Option but {}'.format(type(opt))

    # Checked in order; the first matching type name wins.
    factories = (
        ('matrix_market', MatrixMarket),
        ('stream', Stream),
    )
    for type_name, factory in factories:
        if opt['type'] == type_name:
            return factory(opt)
    raise RuntimeError('Unexpected data.type: {}'.format(opt['type']))
def _test5_validation(self, cls, opt, ndcg=0.06, map=0.04):
    """Train ``cls`` and check its validation metrics exceed the given floors.

    Args:
        cls: Algorithm class under test; built as ``cls(opt, data_opt=...)``.
        opt: Algorithm option object, passed to ``cls`` unchanged.
        ndcg: Value the resulting ``ndcg`` must be greater than.
        map: Value the resulting ``map`` must be greater than.
    """
    set_log_level(2)
    data_opt = MatrixMarketOptions().get_default_option()
    for field in ('main', 'uid', 'iid'):
        setattr(data_opt.input, field, self.ml_100k + field)
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    model = cls(opt, data_opt=data_opt)
    model.initialize()
    model.train()

    results = model.get_validation_results()
    ndcg_ok = results['ndcg'] > ndcg
    self.assertTrue(ndcg_ok, msg='NDCG Test')
    map_ok = results['map'] > map
    self.assertTrue(map_ok, msg='MAP Test')
def get_default_option(self) -> aux.Option:
    """Return the default option tree for ``'matrix_market'`` type data."""
    input_section = {
        'main': '',
        'uid': '',  # when empty, the row id is used as the user id
        'iid': '',  # when empty, the column id is used as the item id
    }
    data_section = {
        'internal_data_type': 'matrix',
        'validation': {
            'name': 'sample',
            'p': 0.01,
            'max_samples': 500,
        },
        'batch_mb': 1024,
        'use_cache': False,
        'tmp_dir': '/tmp/',
        'path': './mm.h5py',
    }
    return aux.Option({
        'type': 'matrix_market',
        'input': input_section,
        'data': data_section,
    })
def test8_serialization(self):
    """Check that CFR similarity results still hold after a save/load round trip."""
    set_log_level(1)
    query = '49.Star_Wars_(1977)'
    expected = '180.Return_of_the_Jedi_(1983)'
    model_path = 'model.bin'

    opt = CFROption().get_default_option()
    data_opt = StreamOptions().get_default_option()
    data_opt.data.sppmi = {'windows': 5, 'k': 10}
    data_opt.data.internal_data_type = 'matrix'
    for field, suffix in (('main', 'stream'), ('uid', 'uid'), ('iid', 'iid')):
        setattr(data_opt.input, field, self.ml_100k + suffix)
    data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    model = CFR(opt, data_opt=data_opt)
    model.initialize()
    model.train()

    def similar_to_query():
        return [name for name, _ in model.most_similar(query)]

    self.assertIn(expected, similar_to_query())

    # Round trip through disk, then make sure the same neighbour is found.
    model.save(model_path)
    model.load(model_path)
    os.remove(model_path)
    self.assertIn(expected, similar_to_query())
def get_default_option(self) -> aux.Option:
    """Return the default option tree for ``'matrix_market'`` type data."""
    input_section = {
        'main': '',  # str or numpy-kind data
        # When empty, the row id is used as the user id. A list or 1-d dense
        # array may also be passed as the id list.
        'uid': '',
        # When empty, the column id is used as the item id. A list or 1-d
        # dense array may also be passed as the id list.
        'iid': '',
    }
    data_section = {
        'internal_data_type': 'matrix',
        'validation': {
            'name': 'sample',
            'p': 0.01,
            'max_samples': 500,
        },
        'batch_mb': 1024,
        'use_cache': False,
        'tmp_dir': '/tmp/',
        'path': './mm.h5py',
    }
    return aux.Option({
        'type': 'matrix_market',
        'input': input_section,
        'data': data_section,
    })
def test0_onebased(self):
    """Create a MatrixMarket store with ``OneBased`` preprocessing and verify its contents."""
    opt = MatrixMarketOptions().get_default_option()
    opt.input.main = self.mm_path
    opt.input.uid = self.uid_path
    opt.input.iid = self.iid_path
    opt.data.value_prepro = aux.Option({'name': 'OneBased'})

    mm = MatrixMarket(opt)
    mm.create()
    # Registered on ``self.temp_files`` (presumably for later cleanup).
    self.temp_files.append(opt.data.path)
    self.assertTrue(True)

    expected_groups = sorted(['vali', 'idmap', 'rowwise', 'colwise'])
    self.assertEqual(sorted(mm.handle.keys()), expected_groups)

    header = mm.get_header()
    for field, expected in (('num_nnz', 5), ('num_users', 5), ('num_items', 3)):
        self.assertEqual(header[field], expected)

    entries = [(row, col, val) for row, col, val in mm.iterate()]
    self.assertEqual(len(entries), 5)
    columns = [int(col) for _, col, _ in entries]
    self.assertEqual(columns, [0, 0, 2, 1, 1])
    values = [int(val) for _, _, val in entries]
    self.assertEqual(values, [1, 1, 1, 1, 1])
    self.assertEqual(entries[2], (2, 2, 1.0))
def test2_implicit_als(self):
    """Create a MatrixMarket store with ``ImplicitALS`` preprocessing and verify its contents."""
    opt = MatrixMarketOptions().get_default_option()
    opt.input.main = self.mm_path
    opt.input.uid = self.uid_path
    opt.input.iid = self.iid_path
    prepro_opt = {'name': 'ImplicitALS', 'epsilon': 0.5}
    opt.data.value_prepro = aux.Option(prepro_opt)

    mm = MatrixMarket(opt)
    mm.create()
    self.assertTrue(True)

    expected_groups = sorted(['vali', 'idmap', 'rowwise', 'colwise'])
    self.assertEqual(sorted(mm.handle.keys()), expected_groups)

    header = mm.get_header()
    for field, expected in (('num_nnz', 5), ('num_users', 5), ('num_items', 3)):
        self.assertEqual(header[field], expected)

    entries = [(row, col, val) for row, col, val in mm.iterate()]
    self.assertEqual(len(entries), 5)
    columns = [int(col) for _, col, _ in entries]
    self.assertEqual(columns, [0, 0, 2, 1, 1])
    # Expected stored value for the third entry under epsilon 0.5.
    self.assertAlmostEqual(entries[2][2], math.log(1 + 1.0 / 0.5))
def __init__(self, opt_path=None, *args, **kwargs):
    """Set up a W2V model: base classes, options, backend object and data.

    Args:
        opt_path: Option source handed to ``self.get_option``; when ``None``
            the ``W2VOption`` defaults are used.
        **kwargs: ``data_opt`` (overrides ``self.opt['data_opt']``) is loaded
            via ``buffalo.data.load`` and created; otherwise a ``Data``
            instance passed as ``data`` is used as-is.

    Raises:
        AssertionError: If the backend cannot parse the option file, or the
            data is not of type ``'stream'``.
    """
    Algo.__init__(self, *args, **kwargs)
    W2VOption.__init__(self, *args, **kwargs)
    Evaluable.__init__(self, *args, **kwargs)
    Serializable.__init__(self, *args, **kwargs)
    Optimizable.__init__(self, *args, **kwargs)
    if opt_path is None:
        opt_path = W2VOption().get_default_option()

    self.logger = log.get_logger('W2V')
    self.opt, self.opt_path = self.get_option(opt_path)
    self.obj = CyW2V()
    # Call init() outside the assert: under ``python -O`` asserts are stripped
    # and the backend would otherwise never be initialized.
    initialized = self.obj.init(bytes(self.opt_path, 'utf-8'))
    assert initialized, 'cannot parse option file: %s' % opt_path

    self.data = None
    data = kwargs.get('data')
    data_opt = self.opt.get('data_opt')
    # An explicit ``data_opt`` keyword takes precedence over the option's own.
    data_opt = kwargs.get('data_opt', data_opt)
    if data_opt:
        self.data = buffalo.data.load(data_opt)
        assert self.data.data_type == 'stream'
        self.data.create()
    elif isinstance(data, Data):
        self.data = data
    self.logger.info('W2V(%s)' % json.dumps(self.opt, indent=2))
    if self.data:
        self.logger.info(self.data.show_info())
        assert self.data.data_type in ['stream']
    self._vocab = aux.Option({
        'size': 0,
        'index': None,
        'inv_index': None,
        'scale': None,
        'dist': None,
        'total_word_count': 0
    })
def example1():
    """Train ALS, print metrics and similar movies, then optimize and repeat.

    After the first training run the hyper-parameters are optimized for
    ``val_ndcg``, the saved model is reloaded, and the similar-movie list and
    best parameters are printed.
    """
    tuned_keys = ('alpha', 'd', 'reg_u', 'reg_i')
    model_path = './example1.ml100k.als.optimize.bin'

    def show_similar_movies():
        print('Similar movies to Star_Wars_(1977)')
        neighbours = als.most_similar('49.Star_Wars_(1977)')
        for position, (movie_name, score) in enumerate(neighbours, start=1):
            print(f'{position:02d}. {score:.3f} {movie_name}')

    log.set_log_level(log.DEBUG)
    als_option = ALSOption().get_default_option()
    als_option.validation = aux.Option({'topk': 10})
    data_option = MatrixMarketOptions().get_default_option()
    data_option.input.main = '../tests/ext/ml-100k/main'
    data_option.input.iid = '../tests/ext/ml-100k/iid'

    als = ALS(als_option, data_opt=data_option)
    als.initialize()
    als.train()
    metrics_json = json.dumps(als.get_validation_results(), indent=2)
    print('MovieLens 100k metrics for validations\n%s' % metrics_json)
    show_similar_movies()

    print('Run hyper parameter optimization for val_ndcg...')
    als.opt.num_workers = 4
    als.opt.evaluation_period = 10
    search_space = {
        'd': ['randint', ['d', 10, 128]],
        'reg_u': ['uniform', ['reg_u', 0.1, 1.0]],
        'reg_i': ['uniform', ['reg_i', 0.1, 1.0]],
        'alpha': ['randint', ['alpha', 1, 10]],
    }
    als.opt.optimize = aux.Option({
        'loss': 'val_ndcg',
        'max_trials': 100,
        'deployment': True,
        'start_with_default_parameters': True,
        'space': search_space,
    })
    log.set_log_level(log.INFO)
    als.opt.model_path = model_path

    # Parameters in effect before the search starts.
    current = {key: getattr(als.opt, key) for key in tuned_keys}
    print(json.dumps(current, indent=2))

    als.optimize()
    als.load(model_path)
    show_similar_movies()

    optimization_res = als.get_optimization_data()
    best_parameters = optimization_res['best_parameters']
    print(json.dumps(optimization_res['best'], indent=2))
    chosen = {key: best_parameters[key] for key in tuned_keys}
    print(json.dumps(chosen, indent=2))