Example #1
def gen_item_index(model):
    article_embedding_matrix = model.get_layer('E-Article').get_weights()[0]

    embedding_size = article_embedding_matrix.shape[1]
    index = HnswIndex(embedding_size)
    for embedding in article_embedding_matrix:
        index.add_data(embedding)
    index.build(n_threads=4)

    article_to_id = load_data('article_to_id')
    id_to_article = {v: k for k, v in article_to_id.items()}

    def most_similar(item, topn=100, threshold=0.3):
        if item not in article_to_id:
            return []

        output = []
        iid = article_to_id[item]
        # fetch extra candidates, keep those under the distance threshold,
        # and drop the first hit, which is the query article itself
        for tiid in [
                e[0] for e in index.search_by_id(
                    iid, topn * 2, include_distances=True) if e[1] < threshold
        ][1:]:
            target_item = id_to_article[tiid]
            output.append(target_item)
            if len(output) == topn:
                break
        return output

    return most_similar
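
A hypothetical use of the returned closure, assuming a trained Keras model with an 'E-Article' embedding layer and the pickled article_to_id mapping loaded above (the article id below is made up):

most_similar = gen_item_index(model)
print(most_similar('@example_article_1', topn=10))  # up to 10 similar articles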
Example #2
class N2(BaseANN):
    def __init__(self, m):
        threads = 8
        self.name = 'N2(m={}, threads={})'.format(m, threads)
        self._m = m
        self._threads = threads
        self._index = None
        print("Init done")

    def fit(self, X):
        X = numpy.array(X)
        X = X.astype(numpy.float32)
        self._index = HnswIndex(X.shape[1], "L2")
        print("Shape", X.shape[1])
        for el in X:
            self._index.add_data(el)
        self._index.build(m=self._m, n_threads=self._threads)
        print("Fit done")

    def query(self, v, n):
        v = v.astype(numpy.float32)
        nns = self._index.search_by_vector(v, n)
        return nns

    def use_threads(self):
        return False
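
A minimal sketch of driving this wrapper by hand outside the benchmark harness, assuming n2.HnswIndex and the BaseANN base class are importable as the snippet requires (the data and m value are made up):

import numpy
X = numpy.random.rand(1000, 32)
algo = N2(m=12)
algo.fit(X)                  # builds an L2 index over the rows of X
print(algo.query(X[0], 10))  # ids of the 10 nearest neighbors of X[0]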
Example #3
    def test01_most_similar(self):
        set_log_level(1)
        model = self.load_text8_model()
        index = HnswIndex(model.L0.shape[1])
        model.normalize('item')
        for f in model.L0:
            index.add_data(f)
        index.build(n_threads=4)
        index.save('n2.bin')

        par = ParW2V(model)

        # naive most_similar, single worker
        model.opt.num_workers = 1
        all_keys = model._idmanager.itemids[::][:10000]
        start_t = time.time()
        [model.most_similar(k, topk=10) for k in all_keys]
        naive_elapsed = time.time() - start_t

        # parallel most_similar, four workers
        par.num_workers = 4
        start_t = time.time()
        par.most_similar(all_keys, topk=10, repr=True)
        par_elapsed = time.time() - start_t

        # parallel most_similar backed by the saved HNSW index
        start_t = time.time()
        par.set_hnsw_index('n2.bin', 'item')
        par.most_similar(all_keys, topk=10, repr=True)
        ann_elapsed = time.time() - start_t
        self.assertTrue(naive_elapsed > par_elapsed * 1.5 > ann_elapsed * 5.0,
                        msg=f'{naive_elapsed} > {par_elapsed} > {ann_elapsed}')
        index.unload()
        os.remove('n2.bin')
Example #4
    @classmethod
    def setUpClass(cls):
        index = HnswIndex(cls.dim)
        for i in xrange(cls.data_num):
            v = [random.gauss(0, 1) for z in xrange(cls.dim)]
            index.add_data(v)
        index.build(n_threads=12)
        index.save(cls.model_fname)
Example #5
    def build_n2(self):
        t = self.tfs_by_doc
        all_words = []
        mapper = {'from_hnsw': {}, 'from_doc_id': {}}

        # build the vocabulary (insertion-ordered, without duplicates)
        for doc_id in t.keys():
            for word in t[doc_id].keys():
                if word not in all_words:
                    all_words.append(word)
        col_len = len(all_words)
        word_to_idx = {w: i for i, w in enumerate(all_words)}

        hnsw = HnswIndex(dimension=col_len, metric='angular')
        for h_idx, doc_id in enumerate(
                tqdm(list(t.keys()), desc="Build N2 Search Space")):
            assert h_idx not in mapper['from_hnsw']
            mapper['from_hnsw'][h_idx] = doc_id
            mapper['from_doc_id'][doc_id] = h_idx
            parchment = np.zeros(col_len, dtype=np.uint16)
            for word, count in t[doc_id].items():
                word_idx = word_to_idx[word]
                parchment[word_idx] = count
            hnsw.add_data(parchment)
        hnsw.build(n_threads=4)
        self.n2 = {'hnsw': hnsw, 'mapper': mapper, 'all_words': all_words}
Example #6
def get_user_embeddings(user_model, seens_total, user_list, batch_size=10000):
    inputs = [[], [], [], [], []]
    includes = []

    user_embeddings = {}
    for user, seens in tqdm(seens_total.items(), desc='user embedding'):
        if seens:
            includes.append(user)
            sequence_info = get_sequential_feature(user,
                                                   seens['articles'],
                                                   seens['ages'],
                                                   data_type='test',
                                                   random_range=False,
                                                   random_sample_length=False,
                                                   positive=True)
            article_sequence, magazine_sequence, author_sequence, user_feature_sequence, target_age, target = sequence_info
            search_keyword_sequence = get_search_keyword_feature(user)
            inputs[0].append(article_sequence)
            inputs[1].append(magazine_sequence)
            inputs[2].append(author_sequence)
            inputs[3].append(user_feature_sequence)
            inputs[4].append(search_keyword_sequence)

    inputs = [np.asarray(x) for x in inputs]
    predicts = user_model.predict(inputs, batch_size=batch_size)

    user_index = HnswIndex(200)
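    # 200 is the embedding dimensionality; it must match user_model's output size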
    for embedding in predicts:
        user_index.add_data(embedding)
    user_index.build(n_threads=multiprocessing.cpu_count())

    user_to_id = {user: i for i, user in enumerate(includes)}
    id_to_user = {v: k for k, v in user_to_id.items()}

    for user in user_list:
        if user in user_to_id:
            user_embeddings[user] = predicts[user_to_id[user]]
        else:
            user_embeddings[user] = None

    def most_similar(user, topn=100, threshold=0.3):
        if user not in user_to_id:
            return []

        output = []
        uid = user_to_id[user]
        for tuid in [
                e[0] for e in user_index.search_by_id(
                    uid, topn * 2, include_distances=True) if e[1] < threshold
        ][1:]:
            target_user = id_to_user[tuid]
            output.append(target_user)
            if len(output) == topn:
                break
        return output

    return user_embeddings, most_similar
Example #7
    def test_search_by_id(self):
        f = 2
        i = HnswIndex(f, 'L2')
        i.add_data([2, 2])
        i.add_data([3, 2])
        i.add_data([3, 3])
        i.build()

        # the query id itself comes back as the first, nearest result
        self.assertEqual(i.search_by_id(0, 3), [0, 1, 2])
        self.assertEqual(i.search_by_id(2, 3), [2, 1, 0])
Example #8
    def test_search_by_vector(self):
        f = 2
        i = HnswIndex(f, 'L2')
        i.add_data([2, 2])
        i.add_data([3, 2])
        i.add_data([3, 3])
        i.build()

        self.assertEqual(i.search_by_vector([4, 4], 3), [2, 1, 0])
        self.assertEqual(i.search_by_vector([1, 1], 3), [0, 1, 2])
        self.assertEqual(i.search_by_vector([4, 2], 3), [1, 2, 0])
Example #9
    def test_search_by_vector(self):
        f = 3
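        # no metric argument, so the index uses n2's default, angular distance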
        i = HnswIndex(f)
        i.add_data([0, 0, 1])
        i.add_data([0, 1, 0])
        i.add_data([1, 0, 0])
        i.build(max_m0=10, m=5)

        self.assertEqual(i.search_by_vector([3, 2, 1], 3), [2, 1, 0])
        self.assertEqual(i.search_by_vector([1, 2, 3], 3), [0, 1, 2])
        self.assertEqual(i.search_by_vector([2, 0, 1], 3), [2, 0, 1])
Example #10
    def test_search_by_id(self):
        f = 3
        i = HnswIndex(f)
        i.add_data([2, 1, 0])
        i.add_data([1, 2, 0])
        i.add_data([0, 0, 1])
        i.build(max_m0=10)

        self.assertEqual(i.search_by_id(0, 3), [0, 1, 2])
        self.assertEqual(i.search_by_id(1, 3), [1, 0, 2])
        self.assertTrue(i.search_by_id(2, 3) in [[2, 0, 1],
                                                 [2, 1, 0]])  # could be either
Example #11
    def test02_small_invalid_dimension2(self):
        index = HnswIndex(80)
        this_is_abnormal = False
        try:
            # the index is 80-dimensional, so adding a 100-dimensional vector must raise
            v = [random.gauss(0, 1) for z in xrange(100)]
            index.add_data(v)
            this_is_abnormal = True
        except:
            pass
        finally:
            del index
        self.assertFalse(this_is_abnormal)
Example #12
    def test03_small_add_data_after_loading(self):
        index = HnswIndex(self.dim)
        index.load(self.model_fname)
        this_is_abnormal = False
        try:
            # adding data to an index that has already been loaded must raise
            v = [random.gauss(0, 1) for z in xrange(self.dim)]
            index.add_data(v)
            this_is_abnormal = True
        except:
            pass
        finally:
            del index
        self.assertFalse(this_is_abnormal)
Example #13
def kNN(matrix: np.ndarray, k: int) -> np.ndarray:
    # for each row, average the (square-rooted) distances to its k nearest
    # neighbors, then return the averages sorted ascending
    index = HnswIndex(matrix.shape[1], 'L2')
    for sample in matrix:
        index.add_data(sample)
    index.build(m=32,
                max_m0=48,
                ef_construction=int(k * 1.1),
                n_threads=cpu_count())

    result = []
    for i in range(0, matrix.shape[0]):
        results = index.search_by_id(i, k, include_distances=True)
        result.append(np.mean(np.sqrt(np.array([dist
                                                for _, dist in results]))))
    return np.sort(result)
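
A hypothetical call, assuming np, cpu_count and HnswIndex are imported as the function requires (the data is made up):

data = np.random.rand(1000, 16)
scores = kNN(data, k=10)
print(scores[:5])  # the five smallest mean neighbor distances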
Example #14
def example2():
    log.set_log_level(log.INFO)
    als_option = ALSOption().get_default_option()
    data_option = MatrixMarketOptions().get_default_option()
    data_option.input.main = '../tests/ext/ml-20m/main'
    data_option.input.iid = '../tests/ext/ml-20m/iid'
    data_option.data.path = './ml20m.h5py'
    data_option.data.use_cache = True

    als = ALS(als_option, data_opt=data_option)
    als.initialize()
    als.train()
    als.normalize('item')
    als.build_itemid_map()

    print(
        'Make item recommendation on als.ml20m.par.top10.tsv with Parallel(Thread=4)'
    )
    par = ParALS(als)
    par.num_workers = 4
    all_items = als._idmanager.itemids
    start_t = time.time()
    with open('als.ml20m.par.top10.tsv', 'w') as fout:
        for idx in range(0, len(all_items), 128):
            topks, _ = par.most_similar(all_items[idx:idx + 128], repr=True)
            for q, p in zip(all_items[idx:idx + 128], topks):
                fout.write('%s\t%s\n' % (q, '\t'.join(p)))
    print('took: %.3f secs' % (time.time() - start_t))

    from n2 import HnswIndex
    index = HnswIndex(als.Q.shape[1])
    for f in als.Q:
        index.add_data(f)
    index.build(n_threads=4)
    index.save('ml20m.n2.index')
    index.unload()
    print(
        'Make item recommendation on als.ml20m.ann.top10.tsv with ANN(Thread=4)'
    )
    par.set_hnsw_index('ml20m.n2.index', 'item')
    par.num_workers = 4
    start_t = time.time()
    with open('als.ml20m.ann.top10.tsv', 'w') as fout:
        for idx in range(0, len(all_items), 128):
            topks, _ = par.most_similar(all_items[idx:idx + 128], repr=True)
            for q, p in zip(all_items[idx:idx + 128], topks):
                fout.write('%s\t%s\n' % (q, '\t'.join(p)))
    print('took: %.3f secs' % (time.time() - start_t))
Example #15
    def test_large_index(self):
        # Generate pairs of random points where the pair is super close
        f = 10
        # q = [random.gauss(0, 10) for z in xrange(f)]
        i = HnswIndex(f, 'L2')
        for j in xrange(0, 10000, 2):
            p = [random.gauss(0, 1) for z in xrange(f)]
            x = [1 + pi + random.gauss(0, 1e-2)
                 for pi in p]  # todo: should be q[i]
            y = [1 + pi + random.gauss(0, 1e-2) for pi in p]
            i.add_data(x)
            i.add_data(y)

        i.build()
        for j in xrange(0, 10000, 2):
            self.assertEqual(i.search_by_id(j, 2), [j, j + 1])
            self.assertEqual(i.search_by_id(j + 1, 2), [j + 1, j])
Example #16
class N2(BaseANN):
    def __init__(self, m, ef_construction, n_threads, ef_search, metric):
        self._m = m
        self._m0 = m * 2
        self._ef_construction = ef_construction
        self._n_threads = n_threads
        self._ef_search = ef_search
        self._index_name = os.path.join(
            INDEX_DIR, "youtube_n2_M%d_efCon%d_n_thread%s" %
            (m, ef_construction, n_threads))
        self.name = "N2_M%d_efCon%d_n_thread%s_efSearch%d" % (
            m, ef_construction, n_threads, ef_search)
        self._metric = metric

        d = os.path.dirname(self._index_name)
        if not os.path.exists(d):
            os.makedirs(d)

    def fit(self, X):
        from n2 import HnswIndex
        if self._metric == 'euclidean':
            self._n2 = HnswIndex(X.shape[1], 'L2')
        else:
            self._n2 = HnswIndex(X.shape[1])
        if os.path.exists(self._index_name):
            logging.debug("Loading index from file")
            self._n2.load(self._index_name)
        else:
            logging.debug("Index file is not exist: {0}".format(
                self._index_name))
            logging.debug("Start fitting")

            for i, x in enumerate(X):
                self._n2.add_data(x.tolist())
            self._n2.build(m=self._m,
                           max_m0=self._m0,
                           ef_construction=self._ef_construction,
                           n_threads=self._n_threads)
            self._n2.save(self._index_name)

    def query(self, v, n):
        return self._n2.search_by_vector(v.tolist(), n, self._ef_search)

    def __str__(self):
        return self.name
Example #17
class N2(BaseANN):
    def __init__(self, m, ef_construction, n_threads, ef_search, metric, batch):
        self.name = "N2_M%d_efCon%d_n_thread%s_efSearch%d%s" % (m, ef_construction, n_threads, ef_search,
                                                                '_batch' if batch else '')
        self._m = m
        self._m0 = m * 2
        self._ef_construction = ef_construction
        self._n_threads = n_threads
        self._ef_search = ef_search
        self._index_name = os.path.join(CACHE_DIR, "index_n2_%s_M%d_efCon%d_n_thread%s"
                                        % (args.dataset, m, ef_construction, n_threads))
        self._metric = metric

    def fit(self, X):
        if self._metric == 'euclidean':
            self._n2 = HnswIndex(X.shape[1], 'L2')
        elif self._metric == 'dot':
            self._n2 = HnswIndex(X.shape[1], 'dot')
        else:
            self._n2 = HnswIndex(X.shape[1])

        if os.path.exists(self._index_name):
            n2_logger.info("Loading index from file")
            self._n2.load(self._index_name, use_mmap=False)
            return

        n2_logger.info("Create Index")
        for i, x in enumerate(X):
            self._n2.add_data(x)
        self._n2.build(m=self._m, max_m0=self._m0, ef_construction=self._ef_construction, n_threads=self._n_threads)
        self._n2.save(self._index_name)

    def query(self, v, n):
        return self._n2.search_by_vector(v, n, self._ef_search)

    def batch_query(self, X, n):
        self.b_res = self._n2.batch_search_by_vectors(X, n, self._ef_search, self._n_threads)

    def get_batch_results(self):
        return self.b_res

    def __str__(self):
        return self.name
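
A minimal sketch of the batch path, with made-up data and parameters (CACHE_DIR and args come from the surrounding benchmark harness):

X = numpy.random.rand(10000, 96).astype(numpy.float32)
algo = N2(m=12, ef_construction=100, n_threads=4, ef_search=50, metric='angular', batch=True)
algo.fit(X)
algo.batch_query(X[:100], 10)       # runs 100 queries across n_threads
print(algo.get_batch_results()[0])  # neighbor ids for the first query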
Example #18
    def precision(self, n, n_trees=10, n_points=10000, n_rounds=10):
        found = 0
        for r in xrange(n_rounds):
            # create random points at increasing distance j from (1000, 0, 0, ...)
            f = 10
            i = HnswIndex(f, 'L2')
            for j in xrange(n_points):
                p = [random.gauss(0, 1) for z in xrange(f - 1)]
                norm = sum([pi**2 for pi in p])**0.5
                x = [1000] + [pi / norm * j for pi in p]
                i.add_data(x)

            i.build()

            nns = i.search_by_vector([1000] + [0] * (f - 1), n)
            self.assertEqual(nns, sorted(nns))  # results should be ordered by distance
            # point j lies at distance j from the query, so the true n nearest
            # are ids 0..n-1; count how many of them were found
            found += len([_x for _x in nns if _x < n])

        return 1.0 * found / (n * n_rounds)
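
A typical assertion built on this helper might look like the following sketch (the 0.9 threshold is made up):

    def test_precision(self):
        self.assertTrue(self.precision(10) >= 0.9)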
Example #19
class N2(BaseANN):
    def __init__(self, m):
        threads = 8
        self.name = 'N2(m={}, threads={})'.format(m, threads)
        self._m = m
        self._threads = threads
        self._index = None

    def fit(self, X):
        X = numpy.array(X)
        X = X.astype(numpy.float32)
        self._index = HnswIndex(X.shape[1], "L2")
        for el in X:
            self._index.add_data(el)
        self._index.build(m=self._m, n_threads=self._threads)

    def query(self, v, n):
        v = v.astype(numpy.float32)
        nns = self._index.search_by_vector(v, n)
        return nns
Example #20
from n2 import HnswIndex
import random

f = 3
t = HnswIndex(f)  # HnswIndex(f, "L2 or angular")
for i in range(1000):
    v = [random.gauss(0, 1) for z in range(f)]
    t.add_data(v)

t.build(m=5, max_m0=10, n_threads=4)
t.save('test.n2')

u = HnswIndex(f, "angular")
u.load('test.n2')

search_id = 1
k = 3
neighbor_ids = u.search_by_id(search_id, k)
print(
    "[search_by_id]: Nearest neighbors of id {}: {}".format(
        search_id,
        neighbor_ids))

example_vector_query = [random.gauss(0, 1) for z in range(f)]
nns = u.search_by_vector(example_vector_query, k, include_distances=True)
print(
    "[search_by_vector]: Nearest neighbors of vector {}: {}".format(
        example_vector_query,
        nns))
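
As Examples #16 and #17 show, search_by_vector also accepts an ef_search argument that trades speed for accuracy; a short sketch against the index loaded above:

# a larger ef_search explores more of the HNSW graph: slower but more accurate
nns_accurate = u.search_by_vector(example_vector_query, k, 200)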
Example #21
def _buffalo(algo_name, database):
    repeat = 3
    options = {'als': {'num_workers': 4,
                       'compute_loss_on_training': False,
                       'd': 32,
                       'num_iters': 10},
               'bpr': {'num_workers': 4,
                       'compute_loss_on_training': False,
                       'd': 32,
                       'num_iters': 100},
              }
    opt = options[algo_name]

    # linear
    if algo_name == 'als':
        PAR = ParALS
        model = BuffaloLib().als(database, return_instance_before_train=True, **opt)
    elif algo_name == 'bpr':
        PAR = ParBPRMF
        model = BuffaloLib().bpr(database, return_instance_before_train=True, **opt)
    model.train()
    model.build_itemid_map()
    model.normalize('item')

    # parallel
    par = PAR(model)

    # ann
    index = HnswIndex(model.P.shape[1])
    for f in model.P:
        index.add_data(f)
    index.build(n_threads=4)
    index.save('bm_n2.bin')

    ann = PAR(model)
    ann.set_hnsw_index('bm_n2.bin', 'item')

    total_queries = 10000
    keys = model._idmanager.itemids[::][:total_queries]
    print('Total queries: %s' % len(keys))
    results = {}
    nn_opts = {'topk': 10}
    # S: naive model scoring, P: parallel wrapper, A: parallel wrapper + HNSW index
    for p, m in [('S', model), ('P', par), ('A', ann)]:
        results[p] = {}
        opt = nn_opts.copy()
        if not isinstance(m, PAR):
            opt['iterable'] = keys
        for num_workers in [1, 2, 4]:
            if isinstance(m, PAR):
                m.num_workers = num_workers
            else:
                m.opt.num_workers = num_workers
            opt['model'] = m
            elapsed, memory_usage = _get_elapsed_time('most_similar',
                                                      keys,
                                                      BuffaloLib(), repeat, **opt)
            s = elapsed / len(keys)
            results[p][f'S={num_workers}'] = s
            results[p][f'E={num_workers}'] = elapsed
            results[p][f'M={num_workers}'] = memory_usage['max']
            results[p][f'A={num_workers}'] = memory_usage['avg']
            results[p][f'B={num_workers}'] = memory_usage['min']
            print(f'{p}M={num_workers} {elapsed} {memory_usage}')
    return results