Пример #1
0
def range_search(dataset, percent, rg_type, seed=808, root="../../data"):
    """Return the mean length of the ground-truth entries for ``dataset``.

    The base and query vectors are read from
    ``{root}/{dataset}/{dataset}_base.fvecs`` and
    ``{root}/{dataset}/{dataset}_query.fvecs``. Ranges and indices are
    obtained from ``_load_rg`` and the ground truth from ``_load_gt``.

    Args:
        dataset: Dataset name; used both as directory and file-name prefix.
        percent: Forwarded unchanged to ``_load_rg`` and ``_load_gt``.
        rg_type: Forwarded unchanged to ``_load_rg`` and ``_load_gt``.
        seed: Forwarded unchanged to ``_load_rg`` and ``_load_gt``.
        root: Directory that contains the per-dataset folders.

    Returns:
        ``np.mean`` of ``len(entry)`` over every entry of the ground truth.
    """
    stem = f"{root}/{dataset}/{dataset}"
    base = fvecs_read(stem + "_base.fvecs")
    queries = fvecs_read(stem + "_query.fvecs")

    # The ranges themselves are not used here; only the indices are passed
    # on to _load_gt.
    _, idx = _load_rg(
        seed, dataset, root, base, queries, percent, rg_type=rg_type)
    truth = _load_gt(
        seed, dataset, root, base, queries, idx, percent, rg_type=rg_type)

    return np.mean(list(map(len, truth)))
Пример #2
0
def run_experiment(dataset,
                   percent,
                   rg_type,
                   recall,
                   seed=808,
                   root="../../data"):
    """Load a dataset, optionally report recalls, and return the mean
    ground-truth length.

    Args:
        dataset: Dataset name; used both as directory and file-name prefix.
        percent: Forwarded unchanged to ``_load_rg`` and ``_load_gt``.
        rg_type: Forwarded unchanged to ``_load_rg`` and ``_load_gt``.
        recall: When truthy, rescale the data, keep only the first 100
            queries and print recall reports for the weighted p-distance
            and for VQ-NT rankings.
        seed: Forwarded unchanged to ``_load_rg`` and ``_load_gt``.
        root: Directory that contains the per-dataset folders.

    Returns:
        ``np.mean`` of ``len(entry)`` over the ground truth. When ``recall``
        is truthy the ground truth has already been cut to its first 100
        entries at that point.
    """
    stem = f"{root}/{dataset}/{dataset}"
    base = fvecs_read(stem + "_base.fvecs")
    queries = fvecs_read(stem + "_query.fvecs")

    ranges = _load_rg(
        seed, dataset, root, base, queries, percent, rg_type=rg_type)
    truth = _load_gt(
        seed, dataset, root, base, queries, ranges, percent, rg_type=rg_type)

    if recall:
        # Normalise everything by the 75th percentile of |base| taken along
        # axis 0. The divisions are deliberately in place.
        scale = np.percentile(np.abs(base), 75, axis=0, keepdims=True)
        base /= scale
        queries /= scale
        ranges /= scale

        # Evaluate on the first 100 queries only.
        queries, ranges, truth = queries[:100], ranges[:100], truth[:100]

        # NOTE: earlier experiments that first transformed the task into an
        # inner-product problem (optionally followed by a 32-dimensional
        # random projection) were disabled here because random projection
        # performed badly after that transformation. A VQ-NT variant with
        # pq_lp equal to p was disabled as well.
        norms = (2, 4, 8)

        for p in norms:
            print("p = {}, ranked by p-dist".format(p))
            ranking = np.argsort(weighted_dist(base, queries, ranges, p=p))
            test_recalls(ranking, truth)

        for m in (2, 4, 8, 16):
            for p in norms:
                print("pq_lp = 2, p = {}, M = {} ranked by VQ-NT".format(p, m))
                ranking = np.argsort(
                    vq_nt_(queries, base, ranges, p, M=m, pq_lp=2))
                test_recalls(ranking, truth)

    return np.mean([len(entry) for entry in truth])
Пример #3
0
def save_pq(dataset, root="../data", seed_=808, percent=75, ms=(2, 32)):
    """Train product quantizers on a dataset and cache their artefacts.

    The base vectors are read from ``{root}/{dataset}/{dataset}_base.fvecs``
    and divided in place by the ``percent``-th percentile of their absolute
    values (taken along axis 0). For every ``Ks`` in ``[256, 512]`` and every
    ``m`` in ``ms`` that is strictly smaller than the vector dimensionality,
    a ``PQ(M=m, Ks=Ks, p=2)`` is fitted and its codewords and codes are
    written next to the dataset. Files that already exist are never
    overwritten.

    Args:
        dataset: Dataset name; used both as directory and file-name prefix.
        root: Directory that contains the per-dataset folders.
        seed_: Only used as part of the output file names; this function
            does not seed any random generator with it.
        percent: Percentile used for the scaling, also part of the scale
            file name.
        ms: Iterable of sub-quantizer counts to try.

    Returns:
        None. The results are the files written to disk.
    """
    prefix = f"{root}/{dataset}/{dataset}"
    xb = fvecs_read(f"{prefix}_base.fvecs")
    scale = np.percentile(np.abs(xb), percent, axis=0, keepdims=True)
    xb /= scale

    # The scale file name depends on neither Ks nor m.
    scaling_file = f"{prefix}_s{seed_}_p{percent}_scale.fvecs"
    # Materialise once so that one-shot iterables work for every Ks.
    usable_ms = [m for m in ms if m < xb.shape[1]]

    for Ks in [256, 512]:
        for m in usable_ms:
            codes_file = f"{prefix}_s{seed_}_pq{m}_ks{Ks}_codes.fvecs"
            centroids_file = f"{prefix}_s{seed_}_pq{m}_ks{Ks}_centroids.fvecs"
            need_centroids = not os.path.isfile(centroids_file)
            need_codes = not os.path.isfile(codes_file)

            # Training is only worth doing when at least one of its outputs
            # is missing.
            if need_centroids or need_codes:
                qt = PQ(M=m, Ks=Ks, p=2)
                qt.fit(xb, iter=20)
                # NOTE(review): when only one of the two files is missing,
                # the new file comes from a fresh fit and may not correspond
                # to the file that was already on disk - confirm whether
                # PQ.fit is deterministic.
                if need_centroids:
                    codewords = qt.codewords.reshape(m * Ks, -1)
                    print(
                        f"codewords shape : {qt.codewords.shape}->{codewords.shape}"
                    )
                    fvecs_writer(centroids_file, codewords)
                if need_codes:
                    codes = qt.encode(xb).astype(np.int32)
                    print(f"codes shape : {codes.shape}")
                    # The codes are int32 and written with ivecs_writer even
                    # though the file name ends in ".fvecs".
                    ivecs_writer(codes_file, codes)

            # Kept inside the loop so that nothing is written when no
            # (Ks, m) combination is processed.
            if not os.path.isfile(scaling_file):
                print(f"scale shape : {scale.shape}")
                fvecs_writer(scaling_file, scale)
Пример #4
0
def _load_rg(seed, dataset, root, xb, xq, percent, rg_type, dim):
    """Load, or generate and cache, per-query ranges and dimension indices.

    Two cache files are used, both under ``{root}/{dataset}/``:
    ``..._sparse_rg.fvecs`` for the ranges and ``..._sparse_idx.txt`` for the
    indices. Only the presence of the range file decides whether the cache
    is used.

    When generating:
      * ``idx`` holds, for every query, the first ``dim`` entries of a random
        permutation of the column indices, sorted ascending.
      * ``rg_`` is a random multiplier of shape ``xq.shape`` times the width
        between the ``50 - percent / 2`` and ``50 + percent / 2`` percentiles
        of ``xb`` along axis 0. The multiplier is ``1 + U(0, 1)`` for
        ``'uniform'``, ``1 + |N(0, 1)|`` for ``'normal'`` and ``Zipf(a=2)``
        for ``'zipf'``.

    Args:
        seed: Seed for ``np.random``; also part of the cache file names.
        dataset: Dataset name; used both as directory and file-name prefix.
        root: Directory that contains the per-dataset folders.
        xb: Base vectors; only used for the percentiles.
        xq: Query vectors; only ``xq.shape`` is used.
        percent: Width, in percentile points, of the central interval.
        rg_type: One of ``'uniform'``, ``'normal'`` or ``'zipf'``.
        dim: Number of column indices kept per query.

    Returns:
        Tuple ``(rg_, idx)``. On a cache hit both values are whatever
        ``fvecs_read`` and ``list2d_reader`` return.

    Raises:
        AssertionError: If the ranges must be generated and ``rg_type`` is
            not one of the supported values.
    """
    nq_, d_ = xq.shape
    prefix = f"{root}/{dataset}/{dataset}"
    rg_file = f"{prefix}_s{seed}_p{percent}_{rg_type}_sparse_rg.fvecs"
    idx_file = f"{prefix}_s{seed}_p{percent}_{rg_type}_sparse_idx.txt"
    # NOTE(review): neither file name contains `dim`, so a cache written
    # with a different `dim` is returned as is.

    if os.path.isfile(rg_file):
        return fvecs_read(rg_file), list2d_reader(idx_file)

    # Validate before anything is written so that a bad rg_type cannot leave
    # an index file without its range file. Raised explicitly (instead of
    # `assert False`) so the check survives `python -O`; the exception type
    # and message are unchanged.
    if rg_type not in ('uniform', 'normal', 'zipf'):
        raise AssertionError(f"unknown range type {rg_type}")

    np.random.seed(seed)
    rnd = np.random.uniform(low=0, high=1, size=(nq_, d_))
    idx = rnd.argsort(axis=1)[:, :dim].copy()
    idx.sort(axis=1)
    list2d_writer(idx_file, idx)

    # Re-seed so that the ranges drawn below do not depend on the random
    # numbers consumed for the indices above.
    np.random.seed(seed)
    lower = np.percentile(xb, q=50 - percent / 2., axis=0, keepdims=True)
    upper = np.percentile(xb, q=50 + percent / 2., axis=0, keepdims=True)
    if rg_type == 'uniform':
        rg_ = 1. + np.random.uniform(low=0., high=1., size=(nq_, d_))
    elif rg_type == 'normal':
        rg_ = 1. + np.abs(np.random.normal(loc=0., scale=1., size=(nq_, d_)))
    else:  # 'zipf'
        rg_ = np.random.zipf(a=2., size=(nq_, d_))
    rg_ = rg_ * (upper - lower)
    fvecs_writer(rg_file, rg_)
    return rg_, idx
Пример #5
0
def run_inner_product(dataset,
                      percent,
                      rg_type,
                      recall,
                      seed=808,
                      root="../../data"):
    """Report recalls after transforming the task into an inner-product one.

    Original author's note: random projection performs badly once the
    problem has been transformed into an inner-product problem.

    Args:
        dataset: Dataset name; used both as directory and file-name prefix.
        percent: Forwarded unchanged to ``_load_rg`` and ``_load_gt``.
        rg_type: Forwarded unchanged to ``_load_rg`` and ``_load_gt``.
        recall: When falsy, the data is loaded and nothing else happens.
        seed: Forwarded unchanged to ``_load_rg`` and ``_load_gt``.
        root: Directory that contains the per-dataset folders.

    Returns:
        None. Results are reported through ``print`` and ``test_recalls``.
    """
    stem = f"{root}/{dataset}/{dataset}"
    base = fvecs_read(stem + "_base.fvecs")
    queries = fvecs_read(stem + "_query.fvecs")

    ranges = _load_rg(
        seed, dataset, root, base, queries, percent, rg_type=rg_type)
    truth = _load_gt(
        seed, dataset, root, base, queries, ranges, percent, rg_type=rg_type)

    if not recall:
        return

    # Normalise everything by the 75th percentile of |base| taken along
    # axis 0. The divisions are deliberately in place.
    scale = np.percentile(np.abs(base), 75, axis=0, keepdims=True)
    base /= scale
    queries /= scale
    ranges /= scale

    # Evaluate on the first 100 queries only.
    queries, ranges, truth = queries[:100], ranges[:100], truth[:100]

    for p in (2, 4, 8, 16):
        print("p = {}, ranked by p-dist".format(p))
        x, q = transform(base, queries, ranges, p=p, intervals=False)
        test_recalls(np.argsort(-np.dot(q, x.T)), truth)

        print("p = {}, ranked by random projection".format(p))
        x, q = transform(base, queries, ranges, p=p, intervals=False)
        # Project both sides onto 32 random Gaussian directions. The
        # generator is not seeded in this function.
        proj = np.random.normal(size=(x.shape[1], 32))
        x_low, q_low = np.dot(x, proj), np.dot(q, proj)
        test_recalls(np.argsort(-np.dot(q_low, x_low.T)), truth)
Пример #6
0
def _load_rg_depracted(seed, dataset, root, xb, xq, lower, upper, rg_type):
    """Load, or generate and cache, per-query ranges (older variant).

    The cache file is
    ``{root}/{dataset}/{dataset}_s{seed}_l{lower}_u{upper}_{rg_type}_rg.fvecs``.
    When it is missing, a random multiplier of shape ``xq.shape`` is drawn
    (``1 + U(0, 1)`` for ``'uniform'``, ``1 + |N(0, 1)|`` for ``'normal'``,
    ``1 + Zipf(a=2)`` for ``'zipf'``), multiplied by one factor per query
    obtained from ``find_scale``, and written to the cache file.

    Args:
        seed: Seed for ``np.random``; also part of the cache file name.
        dataset: Dataset name; used both as directory and file-name prefix.
        root: Directory that contains the per-dataset folders.
        xb: Base vectors; forwarded to ``find_scale``.
        xq: Query vectors; ``xq.shape`` fixes the shape of the result.
        lower: Forwarded to ``find_scale``; also part of the file name.
        upper: Forwarded to ``find_scale``; also part of the file name.
        rg_type: One of ``'uniform'``, ``'normal'`` or ``'zipf'``.

    Returns:
        The ranges, either freshly generated or as read by ``fvecs_read``.

    Raises:
        AssertionError: If the ranges must be generated and ``rg_type`` is
            not one of the supported values.
    """
    nq_, d_ = xq.shape
    prefix = f"{root}/{dataset}/{dataset}"
    rg_file = f"{prefix}_s{seed}_l{lower}_u{upper}_{rg_type}_rg.fvecs"

    if os.path.isfile(rg_file):
        return fvecs_read(rg_file)

    np.random.seed(seed)
    if rg_type == 'uniform':
        rg_ = 1. + np.random.uniform(low=0., high=1., size=(nq_, d_))
    elif rg_type == 'normal':
        rg_ = 1. + np.abs(np.random.normal(loc=0., scale=1., size=(nq_, d_)))
    elif rg_type == 'zipf':
        rg_ = 1. + np.random.zipf(a=2., size=(nq_, d_))
    else:
        # Raised explicitly (instead of `assert False`) so the check
        # survives `python -O`; the exception type and message are unchanged.
        raise AssertionError(f"unknown range type {rg_type}")

    # find_scale's result is reshaped to one factor per query row.
    rg_ = rg_ * find_scale(xb, xq, rg_, lower, upper).reshape(nq_, 1)
    fvecs_writer(rg_file, rg_)
    return rg_
import struct
import numpy as np
from vecs_io import fvecs_read


def srtree_writer(filename, vecs):
    """Write ``vecs`` to ``filename`` as colon-separated text records.

    The first line holds the dimension (``len(vecs[0])``). Every following
    line holds one vector: its components converted with ``str`` and joined
    by ``":"``, followed by ``":(i)"`` where ``i`` is the zero-based row
    number. Presumably this is the record format expected by an SR-tree
    tool - confirm against the consumer.

    Args:
        filename: Path of the text file to create or overwrite.
        vecs: Indexable, non-empty sequence of vectors.

    Raises:
        IndexError: If ``vecs`` is empty. The dimension is read before the
            file is opened, so an existing file is left untouched in that
            case.
    """
    dimension = len(vecs[0])

    # The context manager closes the file even when writing fails midway.
    with open(filename, "w") as f:
        f.write(f"{dimension}\n")
        f.writelines(
            ":".join(map(str, x)) + f":({i})\n" for i, x in enumerate(vecs)
        )


# For each dataset, read the base vectors, the query vectors and the
# seed-808 / 60-percent / zipf ranges, then write each of them next to its
# source as an ".rcd" text file. All three inputs are read before anything
# is written.
for dataset in ("YearPredictionMSD", "sift-128"):
    stem = f"../../data/{dataset}/{dataset}"

    xb = fvecs_read(stem + "_base.fvecs")
    xq = fvecs_read(stem + "_query.fvecs")
    rg = fvecs_read(stem + "_s808_p60_zipf_rg.fvecs")

    srtree_writer(stem + "_base.rcd", xb)
    srtree_writer(stem + "_query.rcd", xq)
    srtree_writer(stem + "_s808_p60_zipf_rg.rcd", rg)