def range_search(dataset, percent, rg_type, seed=808, root="../../data"):
    """Return the mean ground-truth result size over all queries of a dataset.

    Reads ``{root}/{dataset}/{dataset}_base.fvecs`` and
    ``{root}/{dataset}/{dataset}_query.fvecs``, asks ``_load_rg`` for its
    ``(ranges, indices)`` pair, hands the indices to ``_load_gt`` and averages
    the length of every ground-truth entry.

    Args:
        dataset: Dataset name; used as both directory and file-name prefix.
        percent: Forwarded unchanged to ``_load_rg`` and ``_load_gt``.
        rg_type: Forwarded as the ``rg_type`` keyword to both loaders.
        seed: Forwarded unchanged to both loaders.
        root: Directory holding one sub-directory per dataset.

    Returns:
        ``np.mean`` of ``len(entry)`` over the entries returned by ``_load_gt``.

    NOTE(review): the ``_load_rg`` visible in this file takes a required
    ``dim`` argument that is not supplied here -- confirm which loader this
    function is meant to call.
    """
    stem = f"{root}/{dataset}/{dataset}"
    base_vectors = fvecs_read(f"{stem}_base.fvecs")
    query_vectors = fvecs_read(f"{stem}_query.fvecs")

    # Only the second element of the pair is consumed; the ranges are unused.
    _, picked = _load_rg(
        seed, dataset, root, base_vectors, query_vectors, percent,
        rg_type=rg_type,
    )
    truth = _load_gt(
        seed, dataset, root, base_vectors, query_vectors, picked, percent,
        rg_type=rg_type,
    )
    return np.mean([len(hits) for hits in truth])
def run_experiment(dataset, percent, rg_type, recall, seed=808, root="../../data"):
    """Optionally run the recall evaluation, then return the mean result size.

    Loads the base and query vectors, the ranges (``_load_rg``) and the ground
    truth (``_load_gt``).  When ``recall`` is truthy the data is rescaled by
    the per-dimension 75th percentile of ``|xb|``, only the first 100 queries
    are kept, and recall is reported (via ``test_recalls``) for rankings
    produced by ``weighted_dist`` and by ``vq_nt_``.

    Args:
        dataset: Dataset name; used as both directory and file-name prefix.
        percent: Forwarded unchanged to ``_load_rg`` and ``_load_gt``.
        rg_type: Forwarded as the ``rg_type`` keyword to both loaders.
        recall: When falsy, the evaluation is skipped entirely.
        seed: Forwarded unchanged to both loaders.
        root: Directory holding one sub-directory per dataset.

    Returns:
        ``np.mean`` of ``len(entry)`` over the ground-truth entries; after a
        recall run this covers only the first 100 queries.

    NOTE(review): the original indentation was lost; this assumes the whole
    evaluation (scaling, truncation and both recall loops) sits under
    ``if recall`` -- confirm.
    """
    stem = f"{root}/{dataset}/{dataset}"
    base = fvecs_read(f"{stem}_base.fvecs")
    queries = fvecs_read(f"{stem}_query.fvecs")
    ranges = _load_rg(seed, dataset, root, base, queries, percent, rg_type=rg_type)
    truth = _load_gt(seed, dataset, root, base, queries, ranges, percent, rg_type=rg_type)

    if not recall:
        return np.mean([len(hits) for hits in truth])

    # Normalise every dimension in place by the 75th percentile of |base|.
    scale = np.percentile(np.abs(base), 75, axis=0, keepdims=True)
    base /= scale
    queries /= scale
    ranges /= scale

    # Evaluate on the first 100 queries only.
    queries, ranges, truth = queries[:100], ranges[:100], truth[:100]

    # Performance of random projection is bad after the problem is
    # transformed into an inner-product problem, so that variant (p-dist and
    # 32-dim random projection over `transform(...)`) is not run here.

    for p in (2, 4, 8):
        print("p = {}, ranked by p-dist".format(p))
        exact = weighted_dist(base, queries, ranges, p=p)
        test_recalls(np.argsort(exact), truth)

    # The `pq_lp=p` variant of VQ-NT is disabled; only `pq_lp=2` is evaluated.
    settings = [(m, p) for m in (2, 4, 8, 16) for p in (2, 4, 8)]
    for m, p in settings:
        print("pq_lp = 2, p = {}, M = {} ranked by VQ-NT".format(p, m))
        approx = vq_nt_(queries, base, ranges, p, M=m, pq_lp=2)
        test_recalls(np.argsort(approx), truth)

    return np.mean([len(hits) for hits in truth])
def save_pq(dataset, root="../data", seed_=808, percent=75, ms=(2, 32)):
    """Fit product quantizers on the scaled base vectors and cache the results.

    The base vectors are read from ``{root}/{dataset}/{dataset}_base.fvecs``
    and divided in place by the per-dimension ``percent``-th percentile of
    their absolute values.  For every ``Ks`` in ``(256, 512)`` and every ``m``
    in ``ms`` that is smaller than the vector dimensionality, a ``PQ(M=m,
    Ks=Ks, p=2)`` is fitted and up to three files are written next to the
    dataset, each only when it does not exist yet:

    * ``..._s{seed_}_pq{m}_ks{Ks}_centroids.fvecs`` -- codewords reshaped to
      ``(m * Ks, -1)``;
    * ``..._s{seed_}_pq{m}_ks{Ks}_codes.fvecs`` -- ``int32`` codes (written
      with ``ivecs_writer`` despite the ``.fvecs`` suffix);
    * ``..._s{seed_}_p{percent}_scale.fvecs`` -- the scaling vector.

    Args:
        dataset: Dataset name; used as both directory and file-name prefix.
        root: Directory holding one sub-directory per dataset.
        seed_: Only used as a tag in the output file names.
        percent: Percentile of ``|xb|`` used as the per-dimension scale.
        ms: Iterable of candidate sub-space counts.  The default is a tuple
            rather than a list so the default object cannot be mutated
            between calls.

    Returns:
        None.

    NOTE(review): when exactly one of the centroids/codes files exists, the
    missing one is produced from a freshly fitted quantizer; whether that
    matches the file already on disk depends on ``PQ.fit`` being
    deterministic -- confirm.
    """
    prefix = f"{root}/{dataset}/{dataset}"
    xb = fvecs_read(f"{prefix}_base.fvecs")
    scale = np.percentile(np.abs(xb), percent, axis=0, keepdims=True)
    xb /= scale

    scaling_file = f"{prefix}_s{seed_}_p{percent}_scale.fvecs"
    # Materialise once so a one-shot iterable still serves both Ks values.
    usable_ms = [m for m in ms if m < xb.shape[1]]

    for Ks in (256, 512):
        for m in usable_ms:
            codes_file = f"{prefix}_s{seed_}_pq{m}_ks{Ks}_codes.fvecs"
            centroids_file = f"{prefix}_s{seed_}_pq{m}_ks{Ks}_centroids.fvecs"

            need_centroids = not os.path.isfile(centroids_file)
            need_codes = not os.path.isfile(codes_file)

            # Training is the expensive step; skip it when nothing from this
            # quantizer would be written anyway.
            if need_centroids or need_codes:
                qt = PQ(M=m, Ks=Ks, p=2)
                qt.fit(xb, iter=20)

                if need_centroids:
                    codewords = qt.codewords.reshape(m * Ks, -1)
                    print(
                        f"codewords shape : {qt.codewords.shape}->{codewords.shape}"
                    )
                    fvecs_writer(centroids_file, codewords)

                if need_codes:
                    codes = qt.encode(xb).astype(np.int32)
                    print(f"codes shape : {codes.shape}")
                    ivecs_writer(codes_file, codes)

            # Same file for every (Ks, m); written on the first pass only.
            if not os.path.isfile(scaling_file):
                print(f"scale shape : {scale.shape}")
                fvecs_writer(scaling_file, scale)
def _load_rg(seed, dataset, root, xb, xq, percent, rg_type, dim):
    """Load, or generate and cache, per-query ranges and selected dimensions.

    Two files under ``{root}/{dataset}/`` act as the cache:
    ``..._s{seed}_p{percent}_{rg_type}_sparse_rg.fvecs`` (ranges) and
    ``..._s{seed}_p{percent}_{rg_type}_sparse_idx.txt`` (indices).  When the
    ranges file exists both are read back; otherwise both are generated from
    ``seed`` and written.

    Generation:

    * indices -- for every query row, the column positions of the ``dim``
      smallest values of a uniform random matrix (a random subset of columns),
      sorted ascending;
    * ranges -- a random multiplier per (query, dimension) times the width
      between the ``50 - percent/2`` and ``50 + percent/2`` percentiles of
      ``xb`` along axis 0.  The multiplier is ``1 + U(0, 1)`` for
      ``'uniform'``, ``1 + |N(0, 1)|`` for ``'normal'`` and ``Zipf(a=2)`` for
      ``'zipf'``.

    Args:
        seed: Seed for ``np.random`` and tag in the cache file names.
        dataset: Dataset name; used as both directory and file-name prefix.
        root: Directory holding one sub-directory per dataset.
        xb: Base vectors; only their per-dimension percentiles are used.
        xq: Query vectors; only ``xq.shape`` is used.
        percent: Central percentage of the base distribution that defines the
            per-dimension width.
        rg_type: One of ``'uniform'``, ``'normal'`` or ``'zipf'``.
        dim: Number of dimensions selected per query.

    Returns:
        Tuple ``(rg_, idx)``.  On the cached path these are whatever
        ``fvecs_read`` and ``list2d_reader`` return, which may differ in type
        from the freshly generated NumPy arrays.

    Raises:
        AssertionError: If the cache is missing and ``rg_type`` is unknown.

    NOTE(review): ``dim`` is not part of either cache file name, so a cache
    produced with a different ``dim`` is returned as-is -- confirm that this
    is acceptable for callers.
    """
    nq_, d_ = xq.shape
    prefix = f"{root}/{dataset}/{dataset}"
    rg_file = f"{prefix}_s{seed}_p{percent}_{rg_type}_sparse_rg.fvecs"
    idx_file = f"{prefix}_s{seed}_p{percent}_{rg_type}_sparse_idx.txt"

    if os.path.isfile(rg_file):
        rg_ = fvecs_read(rg_file)
        idx = list2d_reader(idx_file)
        return rg_, idx

    # Validate before touching the disk so a bad `rg_type` cannot leave an
    # index file without its ranges file.  An explicit raise (instead of
    # `assert False`) keeps the same exception type and message but is not
    # stripped when Python runs with -O.
    if rg_type not in ('uniform', 'normal', 'zipf'):
        raise AssertionError(f"unknown range type {rg_type}")

    np.random.seed(seed)
    rnd = np.random.uniform(low=0, high=1, size=(nq_, d_))
    idx = rnd.argsort(axis=1)[:, :dim].copy()
    idx.sort(axis=1)
    list2d_writer(idx_file, idx)

    # Reseed so the range draw starts from the same generator state as the
    # index draw did.
    np.random.seed(seed)
    lower = np.percentile(xb, q=50 - percent / 2., axis=0, keepdims=True)
    upper = np.percentile(xb, q=50 + percent / 2., axis=0, keepdims=True)

    if rg_type == 'uniform':
        rg_ = 1. + np.random.uniform(low=0., high=1., size=(nq_, d_))
    elif rg_type == 'normal':
        rg_ = np.random.normal(loc=0., scale=1., size=(nq_, d_))
        rg_ = 1. + np.abs(rg_)
    else:  # 'zipf'
        rg_ = np.random.zipf(a=2., size=(nq_, d_))

    rg_ = rg_ * (upper - lower)
    fvecs_writer(rg_file, rg_)
    return rg_, idx
def run_inner_product(dataset, percent, rg_type, recall, seed=808, root="../../data"):
    """Report recall for the inner-product formulation of the range search.

    Performance of random projection is bad after the problem is transformed
    into an inner-product problem; this function keeps that experiment
    runnable.  For each ``p`` in ``(2, 4, 8, 16)`` it ranks the base vectors
    by the negated inner product of the ``transform``-ed data, once directly
    and once after a 32-dimensional Gaussian random projection, and passes
    both rankings to ``test_recalls``.

    Args:
        dataset: Dataset name; used as both directory and file-name prefix.
        percent: Forwarded unchanged to ``_load_rg`` and ``_load_gt``.
        rg_type: Forwarded as the ``rg_type`` keyword to both loaders.
        recall: When falsy, nothing is evaluated after loading.
        seed: Forwarded unchanged to both loaders.
        root: Directory holding one sub-directory per dataset.

    Returns:
        None.

    NOTE(review): the original indentation was lost; this assumes scaling,
    truncation and the evaluation loop all sit under ``if recall`` -- confirm.
    """
    stem = f"{root}/{dataset}/{dataset}"
    base = fvecs_read(f"{stem}_base.fvecs")
    queries = fvecs_read(f"{stem}_query.fvecs")
    ranges = _load_rg(seed, dataset, root, base, queries, percent, rg_type=rg_type)
    truth = _load_gt(seed, dataset, root, base, queries, ranges, percent, rg_type=rg_type)

    if not recall:
        return None

    # Normalise every dimension in place by the 75th percentile of |base|.
    scale = np.percentile(np.abs(base), 75, axis=0, keepdims=True)
    base /= scale
    queries /= scale
    ranges /= scale

    # Evaluate on the first 100 queries only.
    queries, ranges, truth = queries[:100], ranges[:100], truth[:100]

    for p in (2, 4, 8, 16):
        print("p = {}, ranked by p-dist".format(p))
        x, q = transform(base, queries, ranges, p=p, intervals=False)
        scores = -np.dot(q, x.T)
        test_recalls(np.argsort(scores), truth)

        print("p = {}, ranked by random projection".format(p))
        # The transform is recomputed here, exactly as in the original code.
        x, q = transform(base, queries, ranges, p=p, intervals=False)
        proj = np.random.normal(size=(x.shape[1], 32))
        x_low = np.dot(x, proj)
        q_low = np.dot(q, proj)
        scores = -np.dot(q_low, x_low.T)
        test_recalls(np.argsort(scores), truth)

    return None
def _load_rg_depracted(seed, dataset, root, xb, xq, lower, upper, rg_type):
    """Load, or generate and cache, per-query ranges (older variant).

    The cache file is
    ``{root}/{dataset}/{dataset}_s{seed}_l{lower}_u{upper}_{rg_type}_rg.fvecs``.
    When it exists it is read back with ``fvecs_read``.  Otherwise a random
    multiplier of shape ``xq.shape`` is drawn -- ``1 + U(0, 1)`` for
    ``'uniform'``, ``1 + |N(0, 1)|`` for ``'normal'``, ``1 + Zipf(a=2)`` for
    ``'zipf'`` -- multiplied by one scale per query obtained from
    ``find_scale(xb, xq, rg_, lower, upper)``, and written to the cache.

    Args:
        seed: Seed for ``np.random`` and tag in the cache file name.
        dataset: Dataset name; used as both directory and file-name prefix.
        root: Directory holding one sub-directory per dataset.
        xb: Base vectors, forwarded to ``find_scale``.
        xq: Query vectors; ``xq.shape`` fixes the shape of the ranges.
        lower: Forwarded to ``find_scale``; also a tag in the file name.
        upper: Forwarded to ``find_scale``; also a tag in the file name.
        rg_type: One of ``'uniform'``, ``'normal'`` or ``'zipf'``.

    Returns:
        The ranges, either as read by ``fvecs_read`` or as freshly generated.

    Raises:
        AssertionError: If the cache is missing and ``rg_type`` is unknown.
    """
    nq_, d_ = xq.shape
    prefix = f"{root}/{dataset}/{dataset}"
    rg_file = f"{prefix}_s{seed}_l{lower}_u{upper}_{rg_type}_rg.fvecs"

    if os.path.isfile(rg_file):
        return fvecs_read(rg_file)

    np.random.seed(seed)
    if rg_type == 'uniform':
        rg_ = 1. + np.random.uniform(low=0., high=1., size=(nq_, d_))
    elif rg_type == 'normal':
        rg_ = np.random.normal(loc=0., scale=1., size=(nq_, d_))
        rg_ = 1. + np.abs(rg_)
    elif rg_type == 'zipf':
        rg_ = 1. + np.random.zipf(a=2., size=(nq_, d_))
    else:
        # Explicit raise instead of `assert False`: same exception type and
        # message, but not stripped when Python runs with -O (where the code
        # would otherwise fail later on the unbound `rg_`).
        raise AssertionError(f"unknown range type {rg_type}")

    # One scale per query, broadcast across that query's dimensions.
    rg_ = rg_ * find_scale(xb, xq, rg_, lower, upper).reshape(nq_, 1)
    fvecs_writer(rg_file, rg_)
    return rg_
import struct

import numpy as np

from vecs_io import fvecs_read


def srtree_writer(filename, vecs):
    """Write ``vecs`` to ``filename`` as colon-separated text records.

    The first line holds the dimensionality (``len(vecs[0])``).  Every
    following line holds one vector: its components converted with ``str``
    and joined by ``:``, then ``:`` and the zero-based row number in
    parentheses, e.g. ``1.0:2.0:(0)``.  Presumably this is the record format
    expected by an SR-tree tool -- confirm against the consumer.

    Args:
        filename: Path of the text file to create or overwrite.
        vecs: Indexable, iterable collection of equally long vectors.

    Raises:
        IndexError: If ``vecs`` is empty.  The dimensionality is read before
            the file is opened, so an existing file is left untouched then.
    """
    dimension = len(vecs[0])
    # The context manager closes the handle even if a row fails to serialise.
    with open(filename, "w") as f:
        f.write(f"{dimension}\n")
        for i, x in enumerate(vecs):
            f.write(":".join(map(str, x)) + f":({i})\n")


# Convert the base vectors, query vectors and precomputed ranges of each
# dataset to `.rcd` text files next to the originals.
for dataset in ["YearPredictionMSD", "sift-128"]:
    xb = fvecs_read(f"../../data/{dataset}/{dataset}_base.fvecs")
    xq = fvecs_read(f"../../data/{dataset}/{dataset}_query.fvecs")
    rg = fvecs_read(f"../../data/{dataset}/{dataset}_s808_p60_zipf_rg.fvecs")
    srtree_writer(f"../../data/{dataset}/{dataset}_base.rcd", xb)
    srtree_writer(f"../../data/{dataset}/{dataset}_query.rcd", xq)
    srtree_writer(f"../../data/{dataset}/{dataset}_s808_p60_zipf_rg.rcd", rg)