Exemplo n.º 1
0
def test_unquantize():
    """Check that unquantized Bolt outputs approximate the exact values.

    For a 'dot' encoder and an 'l2' encoder, compares
    ``enc.transform(q, unquantize=True)`` against the exact per-query values
    (``np.dot(X, q)`` and ``_dists_sq(X, q)`` respectively). The mean squared
    error, normalized by the variance of the exact values, must be below .01.
    """
    X, Q = _load_digits_X_Q(nqueries=20)

    def normalized_mse(true_per_query, bolt_per_query):
        # Mean (over queries) of the per-query MSE, divided by the mean
        # (over queries) of the per-query variance of the exact values.
        diffs = [
            true_vals - bolt_vals
            for true_vals, bolt_vals in zip(true_per_query, bolt_per_query)
        ]
        mse = np.mean([np.mean(diff * diff) for diff in diffs])
        var = np.mean([np.var(true_vals) for true_vals in true_per_query])
        return mse / var

    # ------------------------------------------------ dot product

    enc = bolt.Encoder('dot', accuracy='high').fit(X)
    dots_true = [np.dot(X, q) for q in Q]
    dots_bolt = [enc.transform(q, unquantize=True) for q in Q]

    dot_ratio = normalized_mse(dots_true, dots_bolt)
    print("dot product unquantize mse / variance: ", dot_ratio)
    assert dot_ratio < .01

    # ------------------------------------------------ squared l2

    enc = bolt.Encoder('l2', accuracy='high').fit(X)
    dists_true = [_dists_sq(X, q) for q in Q]
    dists_bolt = [enc.transform(q, unquantize=True) for q in Q]

    # Normalize by the variance of the exact squared distances themselves,
    # not by the variance of the dot products computed above.
    l2_ratio = normalized_mse(dists_true, dists_bolt)
    print("squared l2 unquantize mse / variance: ", l2_ratio)
    assert l2_ratio < .01
Exemplo n.º 2
0
def _create_randn_encoder(Ntrain=100, Ntest=20, D=64):
    """Build a default ``bolt.Encoder`` around standard-normal random data.

    The encoder is fit (with ``just_train=True``) on an ``(Ntrain, D)``
    sample and then handed a separate ``(Ntest, D)`` sample via ``set_data``.

    Args:
        Ntrain: number of rows in the training sample.
        Ntest: number of rows in the sample passed to ``set_data``.
        D: number of columns in both samples.

    Returns:
        The encoder after ``fit`` and ``set_data`` have been called.
    """
    encoder = bolt.Encoder()

    # Draw the training sample first, then the test sample, so the sequence
    # of random draws is unchanged.
    train_sample = np.random.randn(Ntrain, D)
    test_sample = np.random.randn(Ntest, D)

    encoder.fit(train_sample, just_train=True)
    encoder.set_data(test_sample)
    return encoder
Exemplo n.º 3
0
def test_time_space_savings():  # mostly to verify readme code
    """Print the memory footprint and timing of Bolt versus plain NumPy.

    Fits a lowest-accuracy dot-product encoder on the loaded data, then
    reports the byte sizes of the raw matrix and of the encoder, and the
    time taken to compute all query dot products with each approach.
    Nothing is asserted; the test only has to run without raising.
    """
    np.set_printoptions(formatter={'float_kind': _fmt_float})

    num_queries = 20
    X, Q = _load_digits_X_Q(num_queries)

    enc = bolt.Encoder(reduction=bolt.Reductions.DOT_PRODUCT,
                       accuracy='lowest')
    enc.fit(X)

    # massive space savings
    raw_bytes = X.nbytes  # 1777 * 64 * 8B = 909KB
    bolt_bytes = enc.nbytes  # 1777 * 2B = 3.55KB
    print("original space usage: {}B".format(raw_bytes))
    print("bolt space usage: {}B".format(bolt_bytes))

    def exact_dot_products():
        return [np.dot(X, q) for q in Q]

    def bolt_dot_products():
        return [enc.transform(q) for q in Q]

    # massive time savings (~10x here, but often >100x on larger datasets
    # with less Python overhead; see the Bolt paper)
    num_runs = 5
    numpy_secs = timeit.timeit(exact_dot_products, number=num_runs)  # ~8ms
    bolt_secs = timeit.timeit(bolt_dot_products, number=num_runs)  # ~800us
    print("Numpy / BLAS time, Bolt time: {:.3f}ms, {:.3f}ms".format(
        numpy_secs * 1000, bolt_secs * 1000))
Exemplo n.º 4
0
def test_basic():
    """Smoke-test Bolt's approximate distances, dot products and kNN search.

    Using low-accuracy encoders fit on the data from ``_load_digits_X_Q``,
    checks that:

    * approximate squared L2 distances correlate with the exact ones
      (mean correlation over queries > .95),
    * approximate dot products correlate with the exact ones (> .95),
    * L2 kNN and max-inner-product kNN each reach precision@10 above .6.
    """
    np.set_printoptions(formatter={'float_kind': _fmt_float})

    nqueries = 20
    X, Q = _load_digits_X_Q(nqueries)

    # ------------------------------------------------ squared l2

    enc = bolt.Encoder(accuracy='low',
                       reduction=bolt.Reductions.SQUARED_EUCLIDEAN)
    enc.fit(X)

    l2_corrs = np.empty(len(Q))
    for i, q in enumerate(Q):
        # Builtin ``int`` replaces the ``np.int`` alias, which was removed
        # in NumPy 1.24 (it was only ever an alias for the builtin).
        l2_true = _dists_sq(X, q).astype(int)
        l2_bolt = enc.transform(q)
        l2_corrs[i] = _corr(l2_true, l2_bolt)
        if i == nqueries - 1:
            # Show one worked example (the last query) for eyeballing.
            print("l2 true: ", l2_true)
            print("l2 bolt: ", l2_bolt)
            print("corr: ", l2_corrs[i])

    mean_l2 = np.mean(l2_corrs)
    std_l2 = np.std(l2_corrs)
    assert mean_l2 > .95
    print("--> squared l2 dist correlation: {} +/- {}".format(mean_l2, std_l2))

    # ------------------------------------------------ dot product

    enc = bolt.Encoder(accuracy='low', reduction=bolt.Reductions.DOT_PRODUCT)
    enc.fit(X)

    dot_corrs = np.empty(nqueries)
    for i, q in enumerate(Q):
        dots_true = np.dot(X, q)
        dots_bolt = enc.transform(q)
        dot_corrs[i] = _corr(dots_true, dots_bolt)

    mean_dot = np.mean(dot_corrs)
    std_dot = np.std(dot_corrs)
    assert mean_dot > .95
    print("--> dot product correlation: {} +/- {}".format(mean_dot, std_dot))

    # ------------------------------------------------ l2 knn

    enc = bolt.Encoder(accuracy='low', reduction='l2')
    enc.fit(X)

    k_bolt = 10  # tell bolt to search for true knn
    k_true = 10  # compute this many true neighbors
    true_knn = _knn(X, Q, k_true)
    bolt_knn = [enc.knn(q, k_bolt) for q in Q]

    precision = _knn_precision(true_knn, bolt_knn, k_bolt)
    print("--> l2 knn precision@{}: {}".format(k_bolt, precision))
    assert precision > .6

    # ------------------------------------------------ dot knn

    enc = bolt.Encoder(accuracy='low', reduction='dot')
    enc.fit(X)

    k_bolt = 10  # tell bolt to search for true knn
    k_true = 10  # compute this many true neighbors
    true_dists = np.dot(X, Q.T)  # column i holds the dot products for Q[i]
    true_knn = np.empty((nqueries, k_true), dtype=np.int64)
    for i in range(nqueries):
        # Larger dot product == better match, hence smaller_better=False.
        true_knn[i, :] = top_k_idxs(true_dists[:, i],
                                    k_true,
                                    smaller_better=False)
    bolt_knn = [enc.knn(q, k_bolt) for q in Q]

    precision = _knn_precision(true_knn, bolt_knn, k_bolt)
    print("--> max inner product knn precision@{}: {}".format(
        k_bolt, precision))
    assert precision > .6


def _knn_precision(true_knn, approx_knn, k):
    """Return the fraction of approximate neighbors that are true neighbors.

    Args:
        true_knn: indexable by query number; entry ``i`` is the collection
            of exact neighbor indices for query ``i``.
        approx_knn: sequence whose entry ``i`` holds at least ``k``
            approximate neighbor indices for query ``i``.
        k: how many leading approximate neighbors to score per query.

    Returns:
        Mean, over all queries and the first ``k`` approximate neighbors of
        each, of whether that neighbor appears in the query's true neighbors.
    """
    # Builtin ``bool`` replaces the ``np.bool`` alias removed in NumPy 1.24.
    contained = np.empty((len(approx_knn), k), dtype=bool)
    for i, approx_neighbors in enumerate(approx_knn):
        true_neighbors = true_knn[i]
        for j in range(k):
            contained[i, j] = approx_neighbors[j] in true_neighbors
    return np.mean(contained)