Пример #1
0
def main(id_path, emb_path, k):
    """Run an interactive search prompt over an id file and an embedding file.

    The entry count and embedding dimension are derived from the two file
    sizes, the files are opened as an ``EmbeddingData``, and then each line
    read from stdin is parsed as comma-separated integer ids and handed to
    ``search``. An empty line, Ctrl-C, or end of input ends the session.

    Args:
        id_path: Path to the id file; its size must be a non-zero multiple
            of 8 bytes (one u64 per entry).
        emb_path: Path to the embedding file; its size must be a non-zero
            multiple of 4 bytes (f32 values) and hold the same number of
            values for every id.
        k: Passed through unchanged to ``search``.

    Raises:
        AssertionError: If the file sizes are inconsistent with each other,
            or the loaded data reports a different count.
    """
    id_file_size = os.path.getsize(id_path)
    assert id_file_size % 8 == 0, \
        'Id file size is not a multiple of sizeof(u64)'
    # Integer division keeps the arithmetic exact for very large files.
    n = id_file_size // 8
    assert n > 0, 'Id file is empty'

    emb_file_size = os.path.getsize(emb_path)
    assert emb_file_size % 4 == 0, \
        'Embedding file size is not a multiple of sizeof(f32)'
    num_floats = emb_file_size // 4
    assert num_floats > 0, 'Embedding file is empty'
    # Every id must own the same number of floats; otherwise the derived
    # dimension would be silently truncated.
    assert num_floats % n == 0, \
        'Embedding file size is not a multiple of the id count n={}'.format(n)
    d = num_floats // n

    print('Count:', n)
    print('Dimension:', d)

    emb_data = EmbeddingData(id_path, emb_path, d)
    assert emb_data.count() == n, \
        'Count does not match expected: {} != {}'.format(n, emb_data.count())

    print('Enter one or more ids (separated by ","s)')
    while True:
        try:
            line = input('> ').strip()
        except (KeyboardInterrupt, EOFError):
            # Ctrl-C or end of input at the prompt ends the session cleanly
            # instead of escaping as an unhandled exception.
            break
        if line == '':
            break
        try:
            ids = [int(i.strip()) for i in line.split(',')]
            search(ids, k, emb_data)
        except KeyboardInterrupt:
            break
        except Exception:
            # Keep the prompt alive after a bad query; show what went wrong.
            traceback.print_exc()
Пример #2
0
def test_get():
    """Check that batched and single-id ``get`` calls return the same pairs.

    Every id in ``range(N)`` is fetched once in a batch and once on its own;
    each single lookup must echo the id, carry ``DIM`` values, and equal the
    entry at the same position in the batch result.
    """
    emb_data = EmbeddingData(ID_PATH, DATA_PATH, DIM)
    all_ids = list(range(N))
    batch_result = emb_data.get(all_ids)
    assert len(batch_result) == N
    for idx in all_ids:
        single = emb_data.get([idx])
        found_id, vec = single[0]
        assert found_id == idx
        assert len(vec) == DIM
        assert batch_result[idx] == (idx, vec)
Пример #3
0
def test_nn_search():
    """Check nearest-neighbour search against a random query vector.

    The result must contain exactly ``k`` entries, the second field of every
    entry (presumably the distance) must be non-negative, and entries must be
    in non-decreasing order of that field. The sampled variant is only
    exercised for crashes.
    """
    k = 1000
    emb_data = EmbeddingData(ID_PATH, DATA_PATH, DIM)
    # The query is a point in embedding space, so it needs DIM components
    # (one per embedding dimension), not N (the number of stored entries).
    exemplar = [random.random() * 2 - 1. for _ in range(DIM)]
    nn = emb_data.nn([exemplar], k, float('inf'))
    assert len(nn) == k
    assert all(d[1] >= 0 for d in nn)
    assert all(nn[i][1] <= nn[i + 1][1] for i in range(len(nn) - 1))

    # Smoke-test the sampled code path; its output is not checked.
    emb_data.nn([exemplar], k, float('inf'), sample=10)
0
def test_kmeans():
    """Check that k-means assigns every id to one of exactly ``k`` clusters."""
    k = 10
    emb_data = EmbeddingData(ID_PATH, DATA_PATH, DIM)

    # Group the ids by the cluster label they were assigned.
    clusters = {}
    for emb_id, label in emb_data.kmeans(list(range(N)), k):
        clusters.setdefault(label, []).append(emb_id)

    assert len(clusters) == k
    assert sum(map(len, clusters.values())) == N
Пример #5
0
def test_dist():
    """Check that distances by id and by embedding vector agree.

    The first ``n1`` target ids are also the query ids, so their reported
    distances must be (close to) zero. Querying with the embeddings fetched
    for those ids must give exactly the same result as querying by id.
    """
    emb_data = EmbeddingData(ID_PATH, DATA_PATH, DIM)
    n1, n2 = 10, 100
    ids1, ids2 = list(range(n1)), list(range(n2))

    by_id = emb_data.dist_by_id(ids1, ids2)
    assert len(by_id) == len(ids2)
    assert np.allclose(by_id[:n1], 0)

    embs1 = [emb for _id, emb in emb_data.get(ids1)]
    by_vector = emb_data.dist(embs1, ids2)
    assert by_id == by_vector
0
def _load():
    """Validate the id and pose files by size and open them.

    Checks that ``ID_PATH`` holds a non-zero whole number of 8-byte ids and
    that ``POSE_PATH`` holds a non-zero whole number of 4-byte floats with
    the same number of floats for every id, then opens both as an
    ``EmbeddingData`` of dimension ``POSE_DIM``.

    Returns:
        The opened ``EmbeddingData``.

    Raises:
        AssertionError: If the file sizes are inconsistent with each other,
            or the loaded data reports a different count.
    """
    id_file_size = os.path.getsize(ID_PATH)
    assert id_file_size % 8 == 0, \
        'Id file size is not a multiple of sizeof(u64)'
    # Integer division keeps the arithmetic exact for very large files.
    n = id_file_size // 8
    assert n > 0, 'Id file is empty'

    emb_file_size = os.path.getsize(POSE_PATH)
    assert emb_file_size % 4 == 0, \
        'Embedding file size is not a multiple of sizeof(f32)'
    num_floats = emb_file_size // 4
    assert num_floats > 0, 'Embedding file is empty'
    # Every id must own the same number of floats.
    assert num_floats % n == 0, \
        'Embedding file size is not a multiple of the id count n={}'.format(n)

    # NOTE(review): the dimension implied by the file sizes is not compared
    # against POSE_DIM here -- confirm whether that check is wanted.
    emb_data = EmbeddingData(ID_PATH, POSE_PATH, POSE_DIM)
    assert emb_data.count() == n, \
        'Count does not match expected: {} != {}'.format(n, emb_data.count())
    return emb_data
Пример #7
0
def test_knn():
    """Check k-NN prediction on labels that alternate between 0 and 1.

    Every id gets a prediction, every score lies in [0, 1], and more than a
    quarter of the scores fall on each side of 0.5. The sampled variant is
    only exercised for crashes.
    """
    emb_data = EmbeddingData(ID_PATH, DATA_PATH, DIM)
    train_x = list(range(N))
    train_y = [float(i % 2) for i in train_x]

    pred = emb_data.knn_predict(
        train_x, train_y, 5, min_thresh=-1, max_thresh=2)
    assert len(pred) == N

    scores = [score for _, score in pred]
    assert all(0. <= score <= 1. for score in scores)
    # Make sure that the model does predict both classes
    quarter = N / 4
    assert sum(score > 0.5 for score in scores) > quarter
    assert sum(score < 0.5 for score in scores) > quarter

    emb_data.knn_predict(train_x, train_y, 5, sample=10)
Пример #8
0
def test_nn_search_by_id():
    """Check nearest-neighbour search seeded by one id and by several ids.

    In both cases the result must contain exactly ``k`` entries whose second
    field is non-negative and non-decreasing. The sampled variant is only
    exercised for crashes.
    """
    k = 1000
    emb_data = EmbeddingData(ID_PATH, DATA_PATH, DIM)
    no_limit = float('inf')

    def check(results):
        # Same three properties are required of every query below.
        assert len(results) == k
        assert all(entry[1] >= 0 for entry in results)
        assert all(a[1] <= b[1] for a, b in zip(results, results[1:]))

    check(emb_data.nn_by_id([0], k, no_limit))
    check(emb_data.nn_by_id(list(range(25)), k, no_limit))

    emb_data.nn_by_id(list(range(25)), k, no_limit, sample=10)
Пример #9
0
def main(in_ids, in_embs, out_dir, db_name, db_user):
    """Write per-video embedding files for videos dated before 2019.

    Opens a database session (password taken from the ``POSTGRES_PASSWORD``
    environment variable), looks up the ``'3s'`` frame sampler, and for every
    selected video that has at least one face id calls ``write_emb_files``.

    Args:
        in_ids: Id file passed to ``EmbeddingData``.
        in_embs: Embedding file passed to ``EmbeddingData``.
        out_dir: Output directory; created if it does not exist.
        db_name: Database name passed to ``util.get_db_session``.
        db_user: Database user passed to ``util.get_db_session``.
    """
    session = util.get_db_session(
        db_user, os.getenv('POSTGRES_PASSWORD'), db_name)

    # Only repackage embeddings for the old 3s data.
    sampler_query = session.query(schema.FrameSampler).filter_by(name='3s')
    frame_sampler = sampler_query.one()

    # TODO: we only want to repackage the embeddings for pre-2019
    cutoff = datetime(2019, 1, 1)
    video_query = session.query(schema.Video).filter(
        schema.Video.time < cutoff)
    videos = list(video_query.all())

    emb_data = EmbeddingData(in_ids, in_embs, EMBEDDING_DIM)

    os.makedirs(out_dir, exist_ok=True)
    for video in tqdm(videos):
        face_ids = get_face_ids(session, frame_sampler, video)
        if len(face_ids) == 0:
            # Nothing to write for a video without faces.
            continue
        write_emb_files(emb_data, out_dir, video, face_ids)
    print('Done!')
Пример #10
0
def test_logreg():
    """Check that logistic-regression predictions are repeatable.

    Trains on labels that alternate between 0 and 1, predicts twice with the
    same weights, and requires both runs to agree on ids and (approximately)
    on scores once sorted. The sampled variant is only exercised for crashes.
    """
    emb_data = EmbeddingData(ID_PATH, DATA_PATH, DIM)
    train_x = list(range(N))
    train_y = [float(i % 2) for i in train_x]
    weights = emb_data.logreg(
        train_x,
        train_y,
        num_epochs=20,
        learning_rate=0.1,
        l2_penalty=0.01,
        l1_penalty=0.,
    )

    thresholds = dict(min_thresh=-1, max_thresh=2)
    first = emb_data.logreg_predict(weights, **thresholds)
    second = emb_data.logreg_predict(weights, **thresholds)
    for (id_a, score_a), (id_b, score_b) in zip(sorted(first),
                                                sorted(second)):
        assert id_a == id_b and np.isclose(score_a, score_b), \
            'Predictions from saved model do not match'

    emb_data.logreg_predict(weights, sample=10)
Пример #11
0
import os
import numpy as np
from rs_embed import EmbeddingData

# Directory that holds the face embedding files.
EMB_DIR = '/app/data/embs'
# Embedding values and the matching ids live in two separate files.
EMB_PATH = os.path.join(EMB_DIR, 'face_embs.bin')
ID_PATH = os.path.join(EMB_DIR, 'face_ids.bin')
# Dimension handed to EmbeddingData when opening the files.
EMB_DIM = 128

# Opened once at import time and shared by every function in this module.
_EMB_DATA = EmbeddingData(ID_PATH, EMB_PATH, EMB_DIM)


def count():
    """Return the count reported by the module-level embedding data."""
    return _EMB_DATA.count()


def ids(i, n):
    """Get n face ids starting at index i.

    Delegates to the module-level embedding data.
    """
    return _EMB_DATA.ids(i, n)


def get(ids):
    """List of face ids -> List of pairs (id, embedding).

    Delegates to the module-level embedding data. Note that the parameter
    name shadows the module-level ``ids`` function inside this body.
    """
    return _EMB_DATA.get(ids)


def mean(ids):
    """List of face ids -> mean embedding.

    Delegates to the module-level embedding data.
    """
    return _EMB_DATA.mean(ids)

Пример #12
0
def test_mean():
    """Check that the mean over all ids is approximately the zero vector."""
    emb_data = EmbeddingData(ID_PATH, DATA_PATH, DIM)
    everything = list(range(N))
    mean = emb_data.mean(everything)
    # Dummy data is zero-mean
    zeros = [0] * len(mean)
    assert np.allclose(mean, zeros, rtol=0.01, atol=0.01)
Пример #13
0
def test_sample():
    """Check that ``sample`` returns the requested number of existing ids."""
    emb_data = EmbeddingData(ID_PATH, DATA_PATH, DIM)
    for size in (0, 1, 2, 5, 10, 1000):
        drawn = emb_data.sample(size)
        assert len(drawn) == size
        # Every sampled id must be reported as present.
        present = emb_data.exists(drawn)
        assert all(present)
Пример #14
0
def test_exists():
    """Check that ``exists`` is true exactly for ids below ``N``."""
    emb_data = EmbeddingData(ID_PATH, DATA_PATH, DIM)
    queried = [0, 1, 100, 9999, 10000, 10001, 100002]
    expected = [candidate < N for candidate in queried]
    found = emb_data.exists(queried)
    assert found == expected
Пример #15
0
def test_count():
    """Check that the data set reports ``N`` entries."""
    emb_data = EmbeddingData(ID_PATH, DATA_PATH, DIM)
    total = emb_data.count()
    assert total == N