def main(id_path, emb_path, k):
    """Interactively run ``search`` over an embedding dataset.

    The id and embedding files are sanity-checked by size, loaded into an
    ``EmbeddingData``, and then the user is prompted for comma-separated
    integer ids. Each parsed list is passed to ``search`` together with
    ``k``. An empty line, end-of-input, or Ctrl-C ends the session.

    Args:
        id_path: Path to the id file; its size must be a non-zero multiple
            of 8 bytes (one u64 per entry).
        emb_path: Path to the embedding file; its size must be a multiple
            of 4 bytes (f32 values).
        k: Forwarded unchanged to ``search``.

    Raises:
        AssertionError: If a file size is inconsistent, or the loaded
            dataset reports a count different from the one implied by the
            id file size.
    """
    id_file_size = os.path.getsize(id_path)
    assert id_file_size % 8 == 0, \
        'Id file size is not a multiple of sizeof(u64)'
    # Integer division: sizes are exact byte counts, no need to go via float.
    n = id_file_size // 8
    # Without this guard the dimension computation below would raise a bare
    # ZeroDivisionError for an empty id file.
    assert n > 0, 'Id file is empty'

    emb_file_size = os.path.getsize(emb_path)
    assert emb_file_size % 4 == 0, \
        'Embedding file size is not a multiple of sizeof(f32)'

    # Dimension implied by the two file sizes: f32 values per id.
    d = (emb_file_size // 4) // n
    assert d > 0, 'Embedding file holds less than one f32 per id'
    # NOTE(review): this checks the byte size against d, not that
    # n * d * 4 == emb_file_size -- confirm that is the intended check.
    assert emb_file_size % d == 0, \
        'Embedding file size is not a multiple of d={}'.format(d)

    print('Count:', n)
    print('Dimension:', d)

    emb_data = EmbeddingData(id_path, emb_path, d)
    assert emb_data.count() == n, \
        'Count does not match expected: {} != {}'.format(n, emb_data.count())

    print('Enter one or more ids (separated by ","s)')
    while True:
        try:
            line = input('> ').strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C at the prompt ends the session cleanly
            # instead of escaping as an unhandled exception.
            break
        if line == '':
            break
        try:
            ids = [int(i.strip()) for i in line.split(',')]
            search(ids, k, emb_data)
        except KeyboardInterrupt:
            break
        except Exception:
            # Report the failure (e.g. a non-integer id) and keep prompting.
            traceback.print_exc()
def test_get():
    """Batched and single-id ``get`` calls must agree for every id below N."""
    data = EmbeddingData(ID_PATH, DATA_PATH, DIM)

    batched = data.get(list(range(N)))
    assert len(batched) == N

    for idx in range(N):
        # A single-id lookup returns a one-element list of (id, vector).
        found_id, vec = data.get([idx])[0]
        assert found_id == idx
        assert len(vec) == DIM
        assert batched[idx] == (idx, vec)
def test_nn_search():
    """Nearest-neighbour search from a random exemplar embedding.

    Checks that exactly ``k`` results come back, that every distance is
    non-negative, and that results are ordered by non-decreasing distance.
    The sampled variant is only exercised, not checked.
    """
    k = 1000
    emb_data = EmbeddingData(ID_PATH, DATA_PATH, DIM)
    # The exemplar is a query embedding, so it needs one component per
    # embedding dimension (DIM), not one per stored id (N).
    exemplar = [random.random() * 2 - 1. for _ in range(DIM)]
    nn = emb_data.nn([exemplar], k, float('inf'))
    assert len(nn) == k
    assert all(d[1] >= 0 for d in nn)
    assert all(nn[i][1] <= nn[i + 1][1] for i in range(len(nn) - 1))
    # Smoke test: the call with sample=10 only has to complete.
    emb_data.nn([exemplar], k, float('inf'), sample=10)
def test_kmeans():
    """k-means over all N ids must yield k clusters that together hold N ids."""
    k = 10
    data = EmbeddingData(ID_PATH, DATA_PATH, DIM)

    # Group the (id, cluster) pairs by cluster label.
    members_by_cluster = {}
    for emb_id, cluster in data.kmeans(list(range(N)), k):
        members_by_cluster.setdefault(cluster, []).append(emb_id)

    assert len(members_by_cluster) == k
    assert sum(map(len, members_by_cluster.values())) == N
def test_dist():
    """``dist_by_id`` and ``dist`` must agree for the same query embeddings."""
    data = EmbeddingData(ID_PATH, DATA_PATH, DIM)

    n_query, n_target = 10, 100
    query_ids = list(range(n_query))
    target_ids = list(range(n_target))

    by_id = data.dist_by_id(query_ids, target_ids)
    assert len(by_id) == len(target_ids)
    # The first n_query targets are the query ids themselves, so their
    # distances are expected to be (close to) zero.
    assert np.allclose(by_id[:n_query], 0)

    # Same query, but passed as raw embeddings rather than ids.
    query_embs = [vec for _emb_id, vec in data.get(query_ids)]
    by_emb = data.dist(query_embs, target_ids)
    assert by_id == by_emb
def _load():
    """Load the pose embeddings after sanity-checking the file sizes.

    Reads the module-level ``ID_PATH``, ``POSE_PATH`` and ``POSE_DIM``.

    Returns:
        The ``EmbeddingData`` built from ``ID_PATH`` and ``POSE_PATH`` with
        dimension ``POSE_DIM``.

    Raises:
        AssertionError: If a file size is inconsistent, or the loaded
            dataset reports a count different from the one implied by the
            id file size.
    """
    id_file_size = os.path.getsize(ID_PATH)
    assert id_file_size % 8 == 0, \
        'Id file size is not a multiple of sizeof(u64)'
    # Integer division: sizes are exact byte counts, no need to go via float.
    n = id_file_size // 8
    # Without this guard the dimension computation below would raise a bare
    # ZeroDivisionError for an empty id file.
    assert n > 0, 'Id file is empty'

    emb_file_size = os.path.getsize(POSE_PATH)
    assert emb_file_size % 4 == 0, \
        'Embedding file size is not a multiple of sizeof(f32)'

    # Dimension implied by the file sizes. It is used only for the check
    # below; the dataset itself is opened with POSE_DIM.
    d = (emb_file_size // 4) // n
    assert d > 0, 'Embedding file holds less than one f32 per id'
    assert emb_file_size % d == 0, \
        'Embedding file size is not a multiple of d={}'.format(d)

    emb_data = EmbeddingData(ID_PATH, POSE_PATH, POSE_DIM)
    assert emb_data.count() == n, \
        'Count does not match expected: {} != {}'.format(n, emb_data.count())
    return emb_data
def test_knn():
    """k-NN prediction on alternating labels must score both classes."""
    data = EmbeddingData(ID_PATH, DATA_PATH, DIM)
    ids = list(range(N))
    # Labels alternate 0.0 / 1.0 by id parity.
    labels = [float(i % 2) for i in range(N)]

    predictions = data.knn_predict(ids, labels, 5,
                                   min_thresh=-1, max_thresh=2)
    assert len(predictions) == N

    scores = [score for _, score in predictions]
    assert all(0. <= score <= 1. for score in scores)

    # Make sure that the model does predict both classes
    assert sum(score > 0.5 for score in scores) > N / 4
    assert sum(score < 0.5 for score in scores) > N / 4

    # Smoke test: the call with sample=10 only has to complete.
    data.knn_predict(ids, labels, 5, sample=10)
def test_nn_search_by_id():
    """Nearest-neighbour search by id, with one and with several query ids."""
    k = 1000
    data = EmbeddingData(ID_PATH, DATA_PATH, DIM)

    def check(neighbors):
        # Exactly k hits, non-negative distances, non-decreasing order.
        assert len(neighbors) == k
        assert all(hit[1] >= 0 for hit in neighbors)
        assert all(neighbors[pos][1] <= neighbors[pos + 1][1]
                   for pos in range(len(neighbors) - 1))

    check(data.nn_by_id([0], k, float('inf')))
    check(data.nn_by_id(list(range(25)), k, float('inf')))

    # Smoke test: the call with sample=10 only has to complete.
    data.nn_by_id(list(range(25)), k, float('inf'), sample=10)
def main(in_ids, in_embs, out_dir, db_name, db_user):
    """Write embedding files for each pre-2019 video that has face ids.

    The database password is read from the ``POSTGRES_PASSWORD`` environment
    variable. For every video whose ``time`` is before 2019-01-01, the face
    ids returned by ``get_face_ids`` are passed to ``write_emb_files``;
    videos with no face ids are skipped.

    Args:
        in_ids: Id file path handed to ``EmbeddingData``.
        in_embs: Embedding file path handed to ``EmbeddingData``.
        out_dir: Output directory; created if it does not exist.
        db_name: Database name passed to ``util.get_db_session``.
        db_user: Database user passed to ``util.get_db_session``.
    """
    db_password = os.getenv('POSTGRES_PASSWORD')
    session = util.get_db_session(db_user, db_password, db_name)

    # Only repackage embeddings for the old 3s data.
    sampler_query = session.query(schema.FrameSampler).filter_by(name='3s')
    frame_sampler = sampler_query.one()

    # TODO: we only want to repackage the embeddings for pre-2019
    cutoff = datetime(2019, 1, 1)
    video_query = session.query(schema.Video).filter(
        schema.Video.time < cutoff)
    videos = list(video_query.all())

    emb_data = EmbeddingData(in_ids, in_embs, EMBEDDING_DIM)
    os.makedirs(out_dir, exist_ok=True)

    for video in tqdm(videos):
        face_ids = get_face_ids(session, frame_sampler, video)
        if len(face_ids) == 0:
            continue
        write_emb_files(emb_data, out_dir, video, face_ids)

    print('Done!')
def test_logreg():
    """Two predictions from the same logistic-regression weights must match."""
    data = EmbeddingData(ID_PATH, DATA_PATH, DIM)
    ids = list(range(N))
    # Labels alternate 0.0 / 1.0 by id parity.
    labels = [float(i % 2) for i in range(N)]

    weights = data.logreg(
        ids, labels,
        num_epochs=20, learning_rate=0.1, l2_penalty=0.01, l1_penalty=0.)

    first = data.logreg_predict(weights, min_thresh=-1, max_thresh=2)
    second = data.logreg_predict(weights, min_thresh=-1, max_thresh=2)

    # Compare in sorted order so the pairing does not depend on the order
    # in which predictions are returned.
    for (id_a, score_a), (id_b, score_b) in zip(sorted(first),
                                                sorted(second)):
        assert id_a == id_b and np.isclose(score_a, score_b), \
            'Predictions from saved model do not match'

    # Smoke test: the call with sample=10 only has to complete.
    data.logreg_predict(weights, sample=10)
# Module-level accessors backed by one shared EmbeddingData instance for the
# face embeddings.
import os

import numpy as np
from rs_embed import EmbeddingData

EMB_DIR = '/app/data/embs'
EMB_DIM = 128
ID_PATH = os.path.join(EMB_DIR, 'face_ids.bin')
EMB_PATH = os.path.join(EMB_DIR, 'face_embs.bin')

# Built once when the module is imported; every function below uses it.
_EMB_DATA = EmbeddingData(ID_PATH, EMB_PATH, EMB_DIM)


def count():
    """Return the count reported by the shared embedding data."""
    total = _EMB_DATA.count()
    return total


def ids(i, n):
    """Return n face ids, starting at index i."""
    window = _EMB_DATA.ids(i, n)
    return window


def get(ids):
    """Map a list of face ids to a list of (id, embedding) pairs."""
    pairs = _EMB_DATA.get(ids)
    return pairs


def mean(ids):
    """Map a list of face ids to their mean embedding."""
    centroid = _EMB_DATA.mean(ids)
    return centroid
def test_mean():
    """The mean embedding over all N ids must be approximately zero."""
    data = EmbeddingData(ID_PATH, DATA_PATH, DIM)
    centroid = data.mean(list(range(N)))

    # Dummy data is zero-mean
    zeros = [0] * len(centroid)
    assert np.allclose(centroid, zeros, rtol=0.01, atol=0.01)
def test_sample():
    """``sample(k)`` must return k ids, all of which ``exists`` confirms."""
    data = EmbeddingData(ID_PATH, DATA_PATH, DIM)
    sizes = (0, 1, 2, 5, 10, 1000)
    for size in sizes:
        drawn = data.sample(size)
        assert len(drawn) == size
        assert all(data.exists(drawn))
def test_exists():
    """``exists`` must report True exactly for the probed ids below N."""
    data = EmbeddingData(ID_PATH, DATA_PATH, DIM)
    probe_ids = [0, 1, 100, 9999, 10000, 10001, 100002]
    # The test expects an id to be present exactly when it is smaller than N.
    should_exist = [probe < N for probe in probe_ids]
    assert data.exists(probe_ids) == should_exist
def test_count():
    """The dataset must report exactly N entries."""
    reported = EmbeddingData(ID_PATH, DATA_PATH, DIM).count()
    assert reported == N