def _relative_mse(true_vals, approx_vals):
    """Return the mean squared error of `approx_vals` over the variance of `true_vals`.

    Both arguments are sequences holding one array per query. The squared
    error and the variance are each averaged within a query first and then
    across queries, so the ratio is a scale-free error measure.
    """
    diffs = [t - a for t, a in zip(true_vals, approx_vals)]
    mse = np.mean([np.mean(diff * diff) for diff in diffs])
    var = np.mean([np.var(t) for t in true_vals])
    return mse / var


def test_unquantize():
    """Check that unquantized Bolt outputs approximate the exact values.

    For both the dot-product and the squared-L2 reductions, the output of
    `enc.transform(q, unquantize=True)` must have a mean squared error below
    1% of the variance of the exact values it approximates.
    """
    X, Q = _load_digits_X_Q(nqueries=20)

    # ------------------------------------------------ dot product
    enc = bolt.Encoder('dot', accuracy='high').fit(X)
    dots_true = [np.dot(X, q) for q in Q]
    dots_bolt = [enc.transform(q, unquantize=True) for q in Q]

    rel_err = _relative_mse(dots_true, dots_bolt)
    print("dot product unquantize mse / variance: ", rel_err)
    assert rel_err < .01

    # ------------------------------------------------ squared l2
    enc = bolt.Encoder('l2', accuracy='high').fit(X)
    dists_true = [_dists_sq(X, q) for q in Q]
    dists_bolt = [enc.transform(q, unquantize=True) for q in Q]

    # Normalize by the variance of the true *distances*; this previously
    # reused the variance of the dot products computed above.
    rel_err = _relative_mse(dists_true, dists_bolt)
    print("squared l2 unquantize mse / variance: ", rel_err)
    assert rel_err < .01
def _create_randn_encoder(Ntrain=100, Ntest=20, D=64):
    """Return a default `bolt.Encoder` fitted and loaded with random data.

    The encoder is fitted on an (Ntrain, D) standard-normal matrix with
    `just_train=True`, after which an independent (Ntest, D) standard-normal
    matrix is installed through `set_data`.
    NOTE(review): presumably `just_train=True` fits without keeping the
    training rows as the encoder's data -- confirm against `Encoder.fit`.
    """
    encoder = bolt.Encoder()

    # Draw the training matrix before the test matrix so the sequence of
    # values pulled from NumPy's global RNG stays the same.
    train_data = np.random.randn(Ntrain, D)
    test_data = np.random.randn(Ntest, D)

    encoder.fit(train_data, just_train=True)
    encoder.set_data(test_data)
    return encoder
def test_time_space_savings():
    """Run the README's space/time comparison between NumPy and Bolt.

    Smoke test only: it prints the byte sizes of the raw and encoded data
    and the time taken by five repetitions of each query loop, and makes no
    assertions about either.
    """
    np.set_printoptions(formatter={'float_kind': _fmt_float})

    num_queries = 20
    X, Q = _load_digits_X_Q(num_queries)

    enc = bolt.Encoder(
        accuracy='lowest', reduction=bolt.Reductions.DOT_PRODUCT)
    enc.fit(X)

    # massive space savings
    raw_msg = "original space usage: {}B".format(X.nbytes)
    print(raw_msg)  # 1777 * 64 * 8B = 909KB
    enc_msg = "bolt space usage: {}B".format(enc.nbytes)
    print(enc_msg)  # 1777 * 2B = 3.55KB

    def _numpy_dots():
        return [np.dot(X, q) for q in Q]

    def _bolt_dots():
        return [enc.transform(q) for q in Q]

    # massive time savings (~10x here, but often >100x on larger datasets
    # with less Python overhead; see the Bolt paper)
    secs_numpy = timeit.timeit(_numpy_dots, number=5)  # ~8ms
    secs_bolt = timeit.timeit(_bolt_dots, number=5)  # ~800us
    print("Numpy / BLAS time, Bolt time: {:.3f}ms, {:.3f}ms".format(
        secs_numpy * 1000, secs_bolt * 1000))
def _frac_neighbors_recovered(true_knn, bolt_knn, k):
    """Return the fraction of Bolt's first `k` neighbors that are true neighbors.

    `true_knn[i]` and `bolt_knn[i]` hold the neighbor indices for query i.
    Each of the first `k` entries of `bolt_knn[i]` counts as a hit when it
    appears anywhere in `true_knn[i]`; the result is the mean hit rate over
    all queries and all `k` positions.
    """
    hits = [
        [bolt_neighbors[j] in true_neighbors for j in range(k)]
        for true_neighbors, bolt_neighbors in zip(true_knn, bolt_knn)
    ]
    return np.mean(hits)


def test_basic():
    """End-to-end accuracy checks for low-accuracy Bolt encoders.

    Verifies, on the digits data returned by `_load_digits_X_Q`:
      * squared-L2 outputs correlate with the exact distances (mean > .95)
      * dot-product outputs correlate with the exact products (mean > .95)
      * L2 kNN precision@10 exceeds .6
      * maximum-inner-product kNN precision@10 exceeds .6

    Each statistic is printed before it is asserted so that a failing run
    still reports the measured value.
    """
    np.set_printoptions(formatter={'float_kind': _fmt_float})

    nqueries = 20
    X, Q = _load_digits_X_Q(nqueries)
    # NOTE: earlier ad-hoc experiments (since removed) suggested accuracy
    # drops off quickly when X and Q are scaled up by a constant factor.

    # ------------------------------------------------ squared l2

    enc = bolt.Encoder(accuracy='low',
                       reduction=bolt.Reductions.SQUARED_EUCLIDEAN)
    enc.fit(X)

    l2_corrs = np.empty(len(Q))
    for i, q in enumerate(Q):
        # `np.int` was only an alias for the builtin `int` and was removed
        # in NumPy 1.24; use the builtin directly.
        l2_true = _dists_sq(X, q).astype(int)
        l2_bolt = enc.transform(q)
        l2_corrs[i] = _corr(l2_true, l2_bolt)
        if i == len(Q) - 1:  # show one example: the last query
            print("l2 true: ", l2_true)
            print("l2 bolt: ", l2_bolt)
            print("corr: ", l2_corrs[i])

    mean_l2 = np.mean(l2_corrs)
    std_l2 = np.std(l2_corrs)
    print("--> squared l2 dist correlation: {} +/- {}".format(mean_l2, std_l2))
    assert mean_l2 > .95

    # ------------------------------------------------ dot product

    enc = bolt.Encoder(accuracy='low', reduction=bolt.Reductions.DOT_PRODUCT)
    enc.fit(X)

    # Sized from Q (not nqueries) so every slot is written by the loop.
    dot_corrs = np.empty(len(Q))
    for i, q in enumerate(Q):
        dots_true = np.dot(X, q)
        dots_bolt = enc.transform(q)
        dot_corrs[i] = _corr(dots_true, dots_bolt)

    mean_dot = np.mean(dot_corrs)
    std_dot = np.std(dot_corrs)
    print("--> dot product correlation: {} +/- {}".format(mean_dot, std_dot))
    assert mean_dot > .95

    # ------------------------------------------------ l2 knn

    enc = bolt.Encoder(accuracy='low', reduction='l2')
    enc.fit(X)

    k_bolt = 10  # tell bolt to search for true knn
    k_true = 10  # compute this many true neighbors
    true_knn = _knn(X, Q, k_true)
    bolt_knn = [enc.knn(q, k_bolt) for q in Q]

    precision = _frac_neighbors_recovered(true_knn, bolt_knn, k_bolt)
    print("--> l2 knn precision@{}: {}".format(k_bolt, precision))
    assert precision > .6

    # ------------------------------------------------ dot knn

    enc = bolt.Encoder(accuracy='low', reduction='dot')
    enc.fit(X)

    k_bolt = 10  # tell bolt to search for true knn
    k_true = 10  # compute this many true neighbors
    true_dists = np.dot(X, Q.T)  # column i holds the products for query i
    true_knn = np.empty((len(Q), k_true), dtype=np.int64)
    for i in range(len(Q)):
        # larger inner product == closer, hence smaller_better=False
        true_knn[i, :] = top_k_idxs(
            true_dists[:, i], k_true, smaller_better=False)
    bolt_knn = [enc.knn(q, k_bolt) for q in Q]

    precision = _frac_neighbors_recovered(true_knn, bolt_knn, k_bolt)
    print("--> max inner product knn precision@{}: {}".format(
        k_bolt, precision))
    assert precision > .6