Пример #1
0
 def test1_is_valid_option(self):
     """Check ``is_valid_option`` on the default option and on a bad ``type``.

     The default option must validate, an integer ``type`` must raise
     ``RuntimeError``, and restoring ``type`` to ``'stream'`` must validate
     again.
     """
     option = StreamOptions().get_default_option()
     self.assertTrue(StreamOptions().is_valid_option(option))

     # Bind the validator before entering the context manager so that only
     # the validation call itself is covered by the expected exception.
     option['type'] = 1
     validate = StreamOptions().is_valid_option
     with self.assertRaises(RuntimeError):
         validate(option)

     option['type'] = 'stream'
     self.assertTrue(StreamOptions().is_valid_option(option))
Пример #2
0
    def test3_to_matrix(self):
        """Create a ``'matrix'``-typed Stream database and inspect its content.

        Verifies the database keys, the header counters, the row-wise
        iteration output and the column-wise iteration with representative
        names.
        """
        options = StreamOptions().get_default_option()
        options.input.main = self.main_path
        options.input.uid = self.uid_path
        options.data.internal_data_type = 'matrix'
        stream = Stream(options)
        stream.create()
        self.assertTrue(True)  # reached only when create() did not raise

        handle = stream.handle
        # 'sppmi' is expected in the database only when the option enables it.
        expected_keys = ['idmap', 'rowwise', 'colwise', 'vali']
        if options.data.sppmi:
            expected_keys.append('sppmi')
        self.assertEqual(sorted(handle.keys()), sorted(expected_keys))

        header = stream.get_header()
        expected_header = (
            ('num_nnz', 7),  # due to validation samples
            ('num_users', 3),
            ('num_items', 6),
        )
        for field, expected in expected_header:
            self.assertEqual(header[field], expected)

        rowwise = [(key, item, value) for key, item, value in stream.iterate()]
        self.assertEqual(len(rowwise), 7)
        self.assertEqual([key for key, _, _ in rowwise],
                         [0, 0, 0, 0, 1, 2, 2])

        # The result of this first column-wise pass is discarded; it only has
        # to run to completion while yielding 3-tuples.
        for _key, _item, _value in stream.iterate(axis='colwise'):
            pass

        colwise = sorted(
            (key, item, value)
            for key, item, value in stream.iterate(axis='colwise',
                                                   use_repr_name=True))
        self.assertEqual(
            [key for key, _, _ in colwise],
            ['apple', 'coke', 'juice', 'juice', 'mango', 'pie', 'pie'])
Пример #3
0
    def test2_create(self):
        """Create a Stream database with default options and inspect it.

        Verifies the database keys, the header counters and the items
        produced by iterating with representative names. The database path
        is recorded in ``self.temp_files``.
        """
        options = StreamOptions().get_default_option()
        options.input.main = self.main_path
        options.input.uid = self.uid_path
        stream = Stream(options)
        stream.create()
        self.temp_files.append(options.data.path)
        self.assertTrue(True)  # reached only when create() did not raise

        handle = stream.handle
        # 'sppmi' is expected in the database only when the option enables it.
        expected_keys = ['idmap', 'rowwise', 'colwise', 'vali']
        if options.data.sppmi:
            expected_keys.append('sppmi')
        self.assertEqual(sorted(handle.keys()), sorted(expected_keys))

        header = stream.get_header()
        expected_header = (
            ('num_nnz', 9),  # due to validation samples
            ('num_users', 3),
            ('num_items', 6),
        )
        for field, expected in expected_header:
            self.assertEqual(header[field], expected)

        pairs = [(user, item)
                 for user, item in stream.iterate(use_repr_name=True)]
        self.assertEqual(len(pairs), 9)
        expected_items = [
            'apple', 'mango', 'mango', 'apple', 'pie', 'juice', 'pie', 'juice',
            'coke'
        ]
        self.assertEqual([item for _, item in pairs], expected_items)
Пример #4
0
    def test10_fast_most_similar(self):
        """Check that ``most_similar`` gets faster after item normalization.

        Trains a CFR model, runs the same batch of ``most_similar`` queries
        before and after ``normalize(group='item')`` and asserts that the
        second batch takes less time.
        """
        set_log_level(1)

        opt = CFROption().get_default_option()
        data_opt = StreamOptions().get_default_option()
        data_opt.data.sppmi = {"windows": 5, "k": 10}
        data_opt.data.internal_data_type = "matrix"
        data_opt.input.main = self.ml_100k + 'stream'
        data_opt.input.uid = self.ml_100k + 'uid'
        data_opt.input.iid = self.ml_100k + 'iid'
        data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

        c = CFR(opt, data_opt=data_opt)
        c.initialize()
        c.train()

        keys = [x for x, _ in c.most_similar('49.Star_Wars_(1977)', 10)]

        def timed_queries():
            # perf_counter() is monotonic and high resolution, unlike the
            # wall clock, which may jump while the benchmark is running.
            start_t = time.perf_counter()
            for _ in range(100):
                for key in keys:
                    c.most_similar(key)
            return time.perf_counter() - start_t

        elapsed_a = timed_queries()
        c.normalize(group='item')
        elapsed_b = timed_queries()
        # assertGreater reports both timings when the expectation fails.
        self.assertGreater(elapsed_a, elapsed_b)
Пример #5
0
    def test9_compact_serialization(self):
        """Save a CFR model without the user-id map and reload item data only.

        After reloading with ``data_fields=['I', '_idmanager']`` the model
        must still answer ``most_similar`` queries (before and after item
        normalization) and must not expose a ``U`` attribute. The temporary
        ``model.bin`` file is removed when the test finishes.
        """
        import os  # local import: the file-level imports are not visible here

        set_log_level(1)

        opt = CFROption().get_default_option()
        data_opt = StreamOptions().get_default_option()
        data_opt.data.sppmi = {"windows": 5, "k": 10}
        data_opt.data.internal_data_type = "matrix"
        data_opt.input.main = self.ml_100k + 'stream'
        data_opt.input.uid = self.ml_100k + 'uid'
        data_opt.input.iid = self.ml_100k + 'iid'
        data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

        c = CFR(opt, data_opt=data_opt)
        c.initialize()
        c.train()
        ret_a = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
        self.assertIn('180.Return_of_the_Jedi_(1983)', ret_a)
        c.save('model.bin', with_userid_map=False)
        # Registered as soon as the file exists so that it is deleted even
        # when one of the following assertions fails.
        self.addCleanup(os.remove, 'model.bin')
        c = CFR(opt)
        c.load('model.bin', data_fields=['I', '_idmanager'])
        ret_a = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
        self.assertIn('180.Return_of_the_Jedi_(1983)', ret_a)
        self.assertFalse(hasattr(c, 'U'))
        c.normalize(group='item')
        ret_a = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
        self.assertIn('180.Return_of_the_Jedi_(1983)', ret_a)
Пример #6
0
    def test4_train(self):
        """Smoke test: a CFR model initializes and trains without raising."""
        set_log_level(3)
        algo_opt = CFROption().get_default_option()

        stream_opt = StreamOptions().get_default_option()
        stream_opt.data.sppmi = {"windows": 5, "k": 10}
        stream_opt.data.internal_data_type = "matrix"
        stream_opt.input.main = self.ml_100k + 'stream'
        stream_opt.input.uid = self.ml_100k + 'uid'
        stream_opt.input.iid = self.ml_100k + 'iid'
        stream_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

        model = CFR(algo_opt, data_opt=stream_opt)
        model.initialize()
        model.train()
        # Reaching this line means none of the steps above raised.
        self.assertTrue(True)
Пример #7
0
    def test3_init(self):
        """Check the factor matrix shapes after initializing a CFR model.

        With ``d = 20`` the ``U`` matrix must be ``(943, 20)`` and the ``I``
        matrix ``(1682, 20)``.
        """
        set_log_level(3)
        algo_opt = CFROption().get_default_option()
        algo_opt.d = 20

        stream_opt = StreamOptions().get_default_option()
        stream_opt.data.sppmi = {"windows": 5, "k": 10}
        stream_opt.data.internal_data_type = "matrix"
        stream_opt.input.main = self.ml_100k + 'stream'
        stream_opt.input.uid = self.ml_100k + 'uid'
        stream_opt.input.iid = self.ml_100k + 'iid'
        stream_opt.data.path = './ml100k.h5py'

        model = CFR(algo_opt, data_opt=stream_opt)
        self.assertTrue(True)  # construction did not raise
        model.initialize()

        expected_shapes = ((model.U, (943, 20)), (model.I, (1682, 20)))
        for factors, shape in expected_shapes:
            self.assertEqual(factors.shape, shape)
Пример #8
0
    def test5_validation(self, ndcg=0.06, map=0.04):
        """Train CFR with validation enabled and check the metric floors.

        Args:
            ndcg: value the validation ``'ndcg'`` result must exceed.
            map: value the validation ``'map'`` result must exceed.
        """
        set_log_level(3)
        algo_opt = CFROption().get_default_option()
        algo_opt.validation = aux.Option({'topk': 10})
        algo_opt.tensorboard = aux.Option({'root': './tb', 'name': 'cfr'})

        stream_opt = StreamOptions().get_default_option()
        stream_opt.data.validation.name = "sample"
        stream_opt.data.sppmi = {"windows": 5, "k": 10}
        stream_opt.data.internal_data_type = "matrix"
        stream_opt.input.main = self.ml_100k + 'stream'
        stream_opt.input.uid = self.ml_100k + 'uid'
        stream_opt.input.iid = self.ml_100k + 'iid'
        stream_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

        model = CFR(algo_opt, data_opt=stream_opt)
        model.initialize()
        model.train()

        results = model.get_validation_results()
        # Each metric has to be strictly above its caller-supplied floor.
        for metric, floor in (('ndcg', ndcg), ('map', map)):
            self.assertTrue(results[metric] > floor)
Пример #9
0
    def test8_serialization(self):
        """Check that a CFR model answers the same query after save and load.

        The model is saved to ``model.bin``, loaded back into the same
        instance, and the file is removed before the second query.
        """
        set_log_level(1)

        algo_opt = CFROption().get_default_option()
        stream_opt = StreamOptions().get_default_option()
        stream_opt.data.sppmi = {"windows": 5, "k": 10}
        stream_opt.data.internal_data_type = "matrix"
        stream_opt.input.main = self.ml_100k + 'stream'
        stream_opt.input.uid = self.ml_100k + 'uid'
        stream_opt.input.iid = self.ml_100k + 'iid'
        stream_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

        model = CFR(algo_opt, data_opt=stream_opt)
        model.initialize()
        model.train()

        query = '49.Star_Wars_(1977)'
        expected_neighbour = '180.Return_of_the_Jedi_(1983)'

        neighbours = [name for name, _ in model.most_similar(query)]
        self.assertIn(expected_neighbour, neighbours)

        # Round-trip through disk, then drop the temporary file.
        model_file = 'model.bin'
        model.save(model_file)
        model.load(model_file)
        os.remove(model_file)

        neighbours = [name for name, _ in model.most_similar(query)]
        self.assertIn(expected_neighbour, neighbours)
Пример #10
0
    def load_text8_model(self):
        """Return a W2V model for the text8 data, training one if needed.

        When ``text8.w2v.bin`` exists, a fresh ``W2V`` instance is loaded
        from it and returned. Otherwise a model is configured, trained,
        saved and returned.
        """
        model_file = 'text8.w2v.bin'
        if os.path.isfile(model_file):
            cached = W2V()
            cached.load(model_file)
            return cached

        set_log_level(3)
        algo_opt = W2VOption().get_default_option()
        algo_opt.num_workers = 12
        algo_opt.d = 40
        algo_opt.min_count = 4
        algo_opt.num_iters = 10
        algo_opt.model_path = model_file

        stream_opt = StreamOptions().get_default_option()
        stream_opt.input.main = self.text8 + 'main'
        stream_opt.data.path = './text8.h5py'
        stream_opt.data.use_cache = True
        stream_opt.data.validation = {}

        trained = W2V(algo_opt, data_opt=stream_opt)
        trained.initialize()
        trained.train()
        # save() takes no path here; presumably it writes to
        # algo_opt.model_path (confirm against W2V.save).
        trained.save()
        return trained
Пример #11
0
    def test6_topk(self):
        """Check top-k recommendation size and ``most_similar`` stability.

        After training, ``topk_recommendation('1', 10)`` must return exactly
        10 entries, and the ``most_similar`` result for the Star Wars item
        must be the same before and after ``normalize()``.
        """
        set_log_level(1)
        opt = CFROption().get_default_option()
        opt.validation = aux.Option({'topk': 10})
        data_opt = StreamOptions().get_default_option()
        data_opt.data.validation.name = "sample"
        data_opt.data.sppmi = {"windows": 5, "k": 10}
        data_opt.data.internal_data_type = "matrix"
        data_opt.input.main = self.ml_100k + 'stream'
        data_opt.input.uid = self.ml_100k + 'uid'
        data_opt.input.iid = self.ml_100k + 'iid'
        data_opt.data.value_prepro = aux.Option({'name': 'OneBased'})

        c = CFR(opt, data_opt=data_opt)
        c.initialize()
        c.train()
        # assertEqual, not assertTrue: the second argument of assertTrue is
        # the failure message, so the length would never be compared.
        self.assertEqual(len(c.topk_recommendation('1', 10)), 10)
        ret_a = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
        self.assertIn('180.Return_of_the_Jedi_(1983)', ret_a)
        c.normalize()
        ret_b = [x for x, _ in c.most_similar('49.Star_Wars_(1977)')]
        self.assertIn('180.Return_of_the_Jedi_(1983)', ret_b)
        self.assertEqual(ret_a, ret_b)
Пример #12
0
 def test0_get_default_option(self):
     """Smoke test: building the default stream option must not raise."""
     _ = StreamOptions().get_default_option()
     # Reaching this line means the call above completed.
     self.assertTrue(True)
Пример #13
0
    def test5_text8_accuracy(self):
        """Measure top-1 analogy accuracy of W2V on text8.

        Loads ``text8.accuracy.w2v.bin`` when it exists, otherwise trains a
        model. Then evaluates the ``capital-common-countries`` section of
        ``./ext/text8/questions-words.txt``: for each ``a b c answer`` line
        whose words all have a feature, the first neighbour of ``b + c - a``
        that is not one of ``a``, ``b``, ``c`` is compared with ``answer``.
        The accuracy must exceed 0.7.
        """
        set_log_level(2)
        opt = W2VOption().get_default_option()
        opt.num_workers = 12
        opt.d = 200
        opt.num_iters = 15
        opt.min_count = 4
        data_opt = StreamOptions().get_default_option()
        data_opt.input.main = self.text8 + 'main'
        data_opt.data.path = './text8.h5py'
        data_opt.data.use_cache = True
        data_opt.data.validation = {}

        model_path = 'text8.accuracy.w2v.bin'
        w = W2V(opt, data_opt=data_opt)
        if os.path.isfile(model_path):
            w.load(model_path)
        else:
            w.initialize()
            w.train()
            w.build_itemid_map()

        with open('./ext/text8/questions-words.txt') as fin:
            questions = fin.read().strip().split('\n')

        met = {}
        target_class = ['capital-common-countries']
        class_name = None
        for line in questions:
            if not line:
                continue
            if line.startswith(':'):
                # Section header of the form ": <class name>".
                _, class_name = line.split(' ', 1)
                if class_name in target_class and class_name not in met:
                    met[class_name] = {'hit': 0, 'miss': 0, 'total': 0}
                continue
            if class_name not in target_class:
                continue
            a, b, c, answer = line.lower().strip().split()
            # Skip questions that contain a word without a feature.
            if any(w.get_feature(t) is None for t in (a, b, c, answer)):
                continue
            topk = w.most_similar(
                w.get_weighted_feature({
                    b: 1,
                    c: 1,
                    a: -1
                }))
            for nn, _ in topk:
                if nn in (a, b, c):
                    continue
                if nn == answer:
                    met[class_name]['hit'] += 1
                else:
                    met[class_name]['miss'] += 1
                break  # top-1
            met[class_name]['total'] += 1

        # Fail with a readable message instead of KeyError or
        # ZeroDivisionError when nothing could be evaluated.
        stat = met.get('capital-common-countries')
        self.assertIsNotNone(
            stat, 'capital-common-countries section not found in questions')
        self.assertGreater(
            stat['total'], 0,
            'no capital-common-countries question could be evaluated')
        acc = float(stat['hit']) / stat['total']
        print('Top1-Accuracy={:0.3f}'.format(acc))
        self.assertGreater(acc, 0.7)