def batch_features(x, batch_size=1):
    for i in range(0, x.shape[0], batch_size):
        xb = x[i:i + batch_size]
        xb = utils.compute_if_dask(xb, progress=False)
        for xi in xb:
            yield xi
        del xb
def features_to_str(x, batch_size=1, boost=False):
    for i in range(0, x.shape[0], batch_size):
        xb = x[i:i + batch_size]
        xb = utils.compute_if_dask(xb, progress=False)
        for xi in xb:
            yield surrogate_text(xi, boost=boost)
        del xb
def generate_index_actions(es, index_name, x, x_ids, thr, s, batch_size=1):
    for i in range(0, x.shape[0], batch_size):
        xb = x[i:i + batch_size]
        xb = thr_sq(xb, thr, s)
        xb = utils.compute_if_dask(xb, progress=False)
        id_b = x_ids[i:i + batch_size]
        for xi_id, xi in zip(id_b, xb):
            # if es.exists(index_name, xi_id):
            #     tqdm.write(f'Skipping: {xi_id}')
            #     continue
            yield {'_index': index_name, '_id': xi_id, 'repr': surrogate_text(xi)}
        del xb
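# `thr_sq` is defined elsewhere in the repo; the sketch below only records the behavior
# assumed here, based on its arguments (`thr` comes from args.threshold, `s` from
# args.sq_factor): zero out components below the threshold, then scalar-quantize the
# surviving activations into integer term frequencies. The helper below is hypothetical,
# for illustration only.
#
# def thr_sq(x, thr, s):
#     x = np.where(x >= thr, x, 0)         # thresholding: drop small activations
#     return np.floor(x * s).astype(int)   # scalar quantization to integer counts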
def surrogate_text(x, boost=False):
    surrogate = []
    x = utils.compute_if_dask(x, progress=False)
    for term, freq in enumerate(x):
        if freq:
            if boost:
                surrogate.append('{}^{}'.format(term, freq))
            else:
                surrogate.extend([str(term)] * int(freq))  # freq may be a numpy scalar
    return ' '.join(surrogate)
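# Usage sketch (illustrative, assuming utils.compute_if_dask passes plain numpy arrays
# through unchanged): after thresholding and scalar quantization, a feature vector
# becomes a bag of terms, one per non-zero component; the component index is the term
# and its quantized value is the term frequency.
#
#   surrogate_text(np.array([0, 3, 0, 1]))              -> '1 1 1 3'
#   surrogate_text(np.array([0, 3, 0, 1]), boost=True)  -> '1^3 3^1'
#
# With boost=True the value is emitted once in Lucene/Elasticsearch boost syntax
# ("term^weight") instead of repeating the term, which keeps query strings short.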
def main(args):
    dataset, q, x = utils.load_benchmark(args.dataset, args.features)
    q = utils.load_features(q, chunks=(2500, 2048))
    x = utils.load_features(x, chunks=(2500, 2048))

    if args.limit:
        x = x[:args.limit]

    n_points, dim = x.shape

    if args.n_cells is None:
        step_k = 2500
        min_points_per_centroid = 39.0
        max_points_per_centroid = 256.0
        # n_train_points = min(n_points, 120000)  # train index with less points or it crashes..
        min_k = np.ceil(n_points / (step_k * max_points_per_centroid)).astype(int) * step_k
        max_k = np.floor(n_points / (step_k * min_points_per_centroid)).astype(int) * step_k
        args.n_cells = min_k
        print('Using min suggested cells:', args.n_cells)

    exp = Experiment(args, root=args.output, ignore=('output', 'pretrained'))
    print(exp)

    # create or load the faiss index
    index_file = exp.path_to('index.faiss')
    if not os.path.exists(index_file):
        if args.pretrained:
            print('Loading pre-trained empty index ...')
            index = faiss.read_index(args.pretrained)
            train_time = None
        else:
            tmp = utils.compute_if_dask(x)
            print('Creating index: training ...')
            index = faiss.index_factory(dim, 'IVF{},PQ{}'.format(args.n_cells, args.code_size))
            # index = faiss.index_factory(dim, 'IVF{},Flat'.format(args.n_cells))
            start = time.time()
            index.train(tmp)
            train_time = time.time() - start
            del tmp

        print('Creating index: adding ...')
        start = time.time()
        bs = 2**14
        for i in trange(0, x.shape[0], bs):
            batch = utils.compute_if_dask(x[i:i + bs])
            index.add(batch)
        add_time = time.time() - start

        faiss.write_index(index, index_file)
        size = os.path.getsize(index_file)

        index_stats_file = exp.path_to('index_stats.csv')
        index_stats = pd.DataFrame({'size': size, 'train_time': train_time, 'add_time': add_time}, index=[0])
        index_stats.to_csv(index_stats_file, index=False)
    else:
        print('Loading pre-built index ...')
        index = faiss.read_index(index_file)

    n_probes = (1, 2, 5, 10, 25)  # , 50, 100, 250, 500, 1000, 2500, 5000)
    n_probes = filter(lambda n: n <= args.n_cells, n_probes)

    params = vars(args)
    progress = tqdm(n_probes)
    for p in progress:
        index.nprobe = p
        params['nprobe'] = p
        progress.set_postfix({k: v for k, v in params.items() if k != 'output'})

        scores = None
        scores_file = exp.path_to(f'scores_np{p}.h5')
        if not os.path.exists(scores_file):
            print('Computing scores:', scores_file)
            q = utils.compute_if_dask(q)

            # execute a kNN search using k = dataset size
            ranked_sim, ranked_ids = index.search(q, n_points)

            # we need a full similarity matrix: we build it from the ranked results,
            # initially filled with the worst score (IDs not returned get an infinite distance)
            if False:  # XXX OPTIMIZED VERSION NOT WORKING!!!!
                ranked_ids = np.ma.array(ranked_ids, mask=(ranked_ids < 0))
                id_order = ranked_ids.argsort(axis=1)
                scores = -ranked_sim[np.arange(q.shape[0]).reshape(-1, 1), id_order]
                del ranked_sim, ranked_ids, id_order
            else:
                scores = np.full((q.shape[0], n_points), np.inf)
                for i, (rsims, rids) in enumerate(zip(ranked_sim, ranked_ids)):
                    for rsim, rid in zip(rsims, rids):
                        if rid >= 0:  # faiss marks missing results with -1
                            scores[i, rid] = rsim
                scores = -scores

            utils.save_as_hdf5(scores, scores_file, progress=True)

        query_times, query_times_file = exp.require_csv('query_times.csv', index='n_probes')
        for i in trange(1, 6):
            if utils.value_missing(query_times, p, f'query_time_run{i}'):
                q = utils.compute_if_dask(q)
                start = time.time()
                index.search(q, n_points)
                query_time = time.time() - start
                query_times.at[p, f'query_time_run{i}'] = query_time
                query_times.to_csv(query_times_file)

        metrics, metrics_file = exp.require_csv(f'metrics_np{p}.csv')

        if 'ap' not in metrics:
            if scores is None:
                print('Loading scores...')
                scores = utils.load_features(scores_file)
            print('Computing mAP...')
            metrics['ap'] = dataset.score(scores[...], reduction=False, progress=True)
            metrics.to_csv(metrics_file, index=False)

        if 'ndcg' not in metrics:
            dataset._load()  # TODO in y_true getter
            if scores is None:
                print('Loading scores...')
                scores = utils.load_features(scores_file)
            print('Computing nDCG...')
            y_true = dataset.y_true[:, :args.limit] if args.limit else dataset.y_true
            bs = 5
            ndcg = []
            for i in trange(0, y_true.shape[0], bs):
                ndcg.append(dcg(y_true[i:i + bs], scores[i:i + bs], normalized=True))
            ndcg = np.concatenate(ndcg)
            # metrics['ndcg'] = dcg(y_true, scores, normalized=True)
            metrics['ndcg'] = ndcg
            metrics.to_csv(metrics_file, index=False)

        if 'ndcg@25' not in metrics:
            dataset._load()  # TODO in y_true getter
            if scores is None:
                progress.write('Loading scores...')
                scores = utils.load_features(scores_file)[...]
            progress.write('Computing nDCG@25...')
            y_true = dataset.y_true[:, :args.limit] if args.limit else dataset.y_true
            bs = 50
            ndcg = []
            for i in trange(0, y_true.shape[0], bs):
                ndcg.append(dcg(y_true[i:i + bs], scores[i:i + bs], p=25, normalized=True))
            metrics['ndcg@25'] = np.concatenate(ndcg)
            # metrics['ndcg'] = dcg(dataset.y_true, scores, normalized=True)
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'nDCG@25: {metrics["ndcg@25"].mean()}')

        metrics['n_probes'] = p
        metrics.to_csv(metrics_file, index=False)
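# The sketch below is illustrative only (it is not called by main): it reproduces, on
# random data, the sequence of faiss calls exercised above. The factory string, sizes,
# and nprobe are placeholder values; the script derives its own from the dataset and
# the CLI arguments.
def _faiss_ivfpq_demo():
    import numpy as np
    import faiss

    d = 2048
    xb = np.random.rand(50_000, d).astype('float32')  # database vectors
    xq = np.random.rand(10, d).astype('float32')      # query vectors

    index = faiss.index_factory(d, 'IVF256,PQ64')     # inverted file + product quantization
    index.train(xb)                                   # learns coarse centroids and PQ codebooks
    index.add(xb)                                     # encodes and stores the database vectors
    index.nprobe = 8                                  # inverted lists visited per query

    dists, ids = index.search(xq, 100)                # top-100 per query; empty slots have id -1
    return dists, ids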
def main(args):
    es = Elasticsearch(timeout=30, max_retries=10, retry_on_timeout=True)

    dataset, q, x = utils.load_benchmark(args.dataset, args.features)
    q = utils.load_features(q, chunks=(5000, 2048))
    x = utils.load_features(x, chunks=(5000, 2048))

    n_queries, n_samples = q.shape[0], x.shape[0]

    if args.limit:
        x = x[:args.limit]

    if args.crelu:
        q = crelu(q)
        x = crelu(x)

    params = vars(args)
    ignore = ('output', 'force')
    progress = tqdm(zip(args.threshold, args.sq_factor), total=len(args.threshold))
    for thr, s in progress:
        params['threshold'] = thr
        params['sq_factor'] = s
        progress.set_postfix({k: v for k, v in params.items() if k not in ignore})
        exp = Experiment(params, root=args.output, ignore=ignore)

        density, density_file = exp.require_csv('density.csv')

        if 'query_density' not in density:
            progress.write('Computing query density ...')
            q_sq = thr_sq(q, thr, s)
            q_density = (q_sq != 0).mean(axis=0)
            q_density = utils.compute_if_dask(q_density)
            density['query_density'] = q_density
            density.to_csv(density_file, index=False)

        if 'database_density' not in density:
            progress.write('Computing database density ...')
            x_sq = thr_sq(x, thr, s)
            x_density = (x_sq != 0).mean(axis=0)
            x_density = utils.compute_if_dask(x_density)
            density['database_density'] = x_density
            density.to_csv(density_file, index=False)

        index_name = exp.name.lower()
        if not es.indices.exists(index_name) or es.count(index=index_name)['count'] < n_samples or args.force:
            # x_sq = thr_sq(x, thr, s)
            x_ids, _ = dataset.images()
            index_actions = generate_index_actions(es, index_name, x, x_ids, thr, s, 50)
            # index_actions = tqdm(index_actions, total=n_samples)

            progress.write(f'Indexing: {index_name}')
            index_config = {
                "mappings": {
                    "_source": {"enabled": False},  # do not store STR
                    "properties": {"repr": {"type": "text"}}  # FULLTEXT
                },
                "settings": {
                    "index": {"number_of_shards": 1, "number_of_replicas": 0},
                    "analysis": {"analyzer": {"first": {"type": "whitespace"}}}
                }
            }

            # es.indices.delete(index_name, ignore=(400, 404))
            es.indices.create(index_name, index_config, ignore=400)
            es.indices.put_settings({"index": {"refresh_interval": "-1", "number_of_replicas": 0}}, index_name)

            indexing = parallel_bulk(es, index_actions, thread_count=4, chunk_size=150, max_chunk_bytes=2**26)
            indexing = tqdm(indexing, total=n_samples)

            start = time.time()
            deque(indexing, maxlen=0)  # consume the generator, discarding the bulk responses
            add_time = time.time() - start
            progress.write(f'Index time: {add_time}')

            es.indices.put_settings({"index": {"refresh_interval": "1s"}}, index_name)
            es.indices.refresh()

            index_stats_file = exp.path_to('index_stats.csv')
            index_stats = pd.DataFrame({'add_time': add_time}, index=[0])
            index_stats.to_csv(index_stats_file, index=False)

        metrics, metrics_file = exp.require_csv('metrics.csv')

        scores = None
        scores_file = exp.path_to('scores.h5')
        if not os.path.exists(scores_file):
            progress.write('Computing scores...')
            xid2idx = {k: i for i, k in enumerate(dataset.images()[0])}

            q_sq = thr_sq(q, thr, s)
            q_sq = utils.compute_if_dask(q_sq, progress=False)

            scores = np.zeros((n_queries, n_samples), dtype=np.float32)
            query_times = []
            for i, qi in enumerate(tqdm(q_sq)):
                query = {
                    "query": {"query_string": {"default_field": "repr", "query": surrogate_text(qi, boost=True)}},
                    # "from": 0,
                    "size": n_samples
                }
                start = time.time()
                for hit in tqdm(scan(es, query, index=index_name, preserve_order=True), total=n_samples):
                    j = xid2idx[hit['_id']]
                    scores[i, j] = hit['_score']
                query_times.append(time.time() - start)

            metrics['query_time'] = query_times
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'Query time: {metrics.query_time.sum()}')
            utils.save_as_hdf5(scores, scores_file, progress=True)

        if 'ap' not in metrics:
            if scores is None:
                progress.write('Loading scores...')
                scores = utils.load_features(scores_file)[...]
            progress.write('Computing mAP...')
            metrics['ap'] = dataset.score(scores, reduction=False, progress=True)
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'mAP: {metrics.ap.mean()}')

        if 'ndcg' not in metrics:
            dataset._load()  # TODO in y_true getter
            if scores is None:
                progress.write('Loading scores...')
                scores = utils.load_features(scores_file)[...]
            progress.write('Computing nDCG...')
            metrics['ndcg'] = dcg(dataset.y_true, scores, normalized=True)
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'nDCG: {metrics.ndcg.mean()}')
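# Illustrative sketch only (not called by main): indexing one surrogate-text document
# and querying it with the same query_string/boost scheme used above. It assumes the
# elasticsearch-py 7.x client created in main(); the index name and document id are
# placeholders.
def _es_str_demo(es, index_name='str-demo'):
    es.indices.create(index_name, {'mappings': {'properties': {'repr': {'type': 'text'}}}}, ignore=400)

    doc_str = surrogate_text(np.array([0, 3, 0, 1]))                 # -> '1 1 1 3'
    es.index(index=index_name, id='img0', body={'repr': doc_str}, refresh=True)

    query_str = surrogate_text(np.array([0, 2, 0, 1]), boost=True)   # -> '1^2 3^1'
    res = es.search(index=index_name, body={
        'query': {'query_string': {'default_field': 'repr', 'query': query_str}}})
    return [(hit['_id'], hit['_score']) for hit in res['hits']['hits']]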
def main(args):
    lucene_vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    lucene_vm.attachCurrentThread()

    dataset, q, x = utils.load_benchmark(args.dataset, args.features)
    q = utils.load_features(q, chunks=(5000, 2048))
    x = utils.load_features(x, chunks=(5000, 2048))

    if args.limit:
        x = x[:args.limit]

    n_queries, n_samples = q.shape[0], x.shape[0]

    if args.crelu:
        q = crelu(q)
        x = crelu(x)

    params = vars(args)
    ignore = ('output', 'force')
    progress = tqdm(zip(args.threshold, args.sq_factor), total=len(args.threshold))
    for thr, s in progress:
        params['threshold'] = thr
        params['sq_factor'] = s
        progress.set_postfix({k: v for k, v in params.items() if k not in ignore})
        exp = Experiment(params, root=args.output, ignore=ignore)

        density, density_file = exp.require_csv('density.csv')

        if 'query_density' not in density:
            progress.write('Computing query density ...')
            q_re = q.rechunk({0: -1, 1: 'auto'}) if utils.is_dask(q) else q
            q_sq = threshold(q_re, thr, s)
            q_density = (q_sq != 0).mean(axis=0)
            q_density = utils.compute_if_dask(q_density)
            density['query_density'] = q_density
            density.to_csv(density_file, index=False)

        if 'database_density' not in density:
            progress.write('Computing database density ...')
            x_re = x.rechunk({0: -1, 1: 'auto'}) if utils.is_dask(x) else x
            x_sq = threshold(x_re, thr, s)
            x_density = (x_sq != 0).mean(axis=0)
            x_density = utils.compute_if_dask(x_density)
            density['database_density'] = x_density
            density.to_csv(density_file, index=False)

        index_stats, index_stats_file = exp.require_csv('index_stats.csv')
        index_name = exp.name.lower()
        index_path = exp.path_to('lucene_index')
        with LuceneIndex(index_path) as idx:
            if idx.count() < n_samples:
                x_sq = threshold(x, thr, s)
                x_sq = batch_features(x_sq, 5000)
                # x_str = features_to_str(x_sq, 5000)

                progress.write(f'Indexing: {index_name}')
                start = time.time()
                for i, xi in enumerate(tqdm(x_sq, total=n_samples)):
                    idx.add(str(i), xi)
                add_time = time.time() - start
                progress.write(f'Index time: {add_time}')
                index_stats.at[0, 'add_time'] = add_time

            if 'size' not in index_stats.columns:
                index_stats.at[0, 'size'] = utils.get_folder_size(index_path)

            index_stats.to_csv(index_stats_file, index=False)

            metrics, metrics_file = exp.require_csv('metrics.csv')

            scores = None
            scores_file = exp.path_to('scores.h5')
            if not os.path.exists(scores_file):
                progress.write('Computing scores...')
                q_sq = threshold(q, thr, s)
                q_sq = utils.compute_if_dask(q_sq, progress=False)
                # q_str = features_to_str(q_sq, n_queries, boost=True)

                scores = np.zeros((n_queries, n_samples), dtype=np.float32)
                query_times = []

                if True:  # sequential version
                    for i, qi in enumerate(tqdm(q_sq, total=n_queries)):
                        start = time.time()
                        if qi.any():
                            for j, score in tqdm(idx.query(qi, n_samples), total=n_samples):
                                scores[i, int(j)] = score
                            query_times.append(time.time() - start)
                        else:
                            query_times.append(None)
                else:  # Parallel version (currently slower)
                    idx._init_searcher()

                    def _search(i, qi):
                        lucene_vm.attachCurrentThread()
                        scores_i = np.zeros(n_samples, dtype=np.float32)
                        start = time.time()
                        if qi.any():
                            for j, score in idx.query(qi, n_samples):
                                scores_i[int(j)] = score
                            query_time = time.time() - start
                        else:
                            query_time = None
                        return scores_i, query_time

                    queries = enumerate(tqdm(q_sq, total=n_queries))
                    scores_n_times = Parallel(n_jobs=6, prefer="threads")(
                        delayed(_search)(i, qi) for i, qi in queries)
                    scores, query_times = zip(*scores_n_times)
                    scores = np.vstack(scores)

                metrics['query_time'] = query_times
                metrics.to_csv(metrics_file, index=False)
                progress.write(f'Query time: {metrics.query_time.sum()}')
                utils.save_as_hdf5(scores, scores_file, progress=True)

        if 'ap' not in metrics:
            dataset._load()  # TODO in y_true getter
            if scores is None:
                progress.write('Loading scores...')
                scores = utils.load_features(scores_file)[...]
            progress.write('Computing mAP...')
            metrics['ap'] = dataset.score(scores, reduction=False, progress=True)
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'mAP: {metrics.ap.mean()}')

        if 'ndcg' not in metrics:
            dataset._load()  # TODO in y_true getter
            if scores is None:
                progress.write('Loading scores...')
                scores = utils.load_features(scores_file)[...]
            progress.write('Computing nDCG...')
            y_true = dataset.y_true[:, :args.limit] if args.limit else dataset.y_true
            bs = 50
            ndcg = []
            for i in trange(0, y_true.shape[0], bs):
                ndcg.append(dcg(y_true[i:i + bs], scores[i:i + bs], normalized=True))
            metrics['ndcg'] = np.concatenate(ndcg)
            # metrics['ndcg'] = dcg(dataset.y_true, scores, normalized=True)
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'nDCG: {metrics.ndcg.mean()}')

        if 'ndcg@25' not in metrics:
            dataset._load()  # TODO in y_true getter
            if scores is None:
                progress.write('Loading scores...')
                scores = utils.load_features(scores_file)[...]
            progress.write('Computing nDCG@25...')
            y_true = dataset.y_true[:, :args.limit] if args.limit else dataset.y_true
            bs = 50
            ndcg = []
            for i in trange(0, y_true.shape[0], bs):
                ndcg.append(dcg(y_true[i:i + bs], scores[i:i + bs], p=25, normalized=True))
            metrics['ndcg@25'] = np.concatenate(ndcg)
            # metrics['ndcg'] = dcg(dataset.y_true, scores, normalized=True)
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'nDCG@25: {metrics["ndcg@25"].mean()}')
dataset, q, x = utils.load_benchmark(args.dataset, args.features)
x = utils.load_features(x, chunks=(1000, 2048))
q = utils.load_features(q, chunks=(1000, 2048))

x /= da.sqrt((x**2).sum(axis=1, keepdims=True))
q /= da.sqrt((q**2).sum(axis=1, keepdims=True))

if args.rotate:
    R = np.load(args.rotate)
    q = q.dot(R.T)
    x = x.dot(R.T)
    x -= x.mean(axis=0)

scores = q.dot(x.T)
scores = utils.compute_if_dask(scores)

dataset._load()
mean_ap = dataset.score(scores)
print(mean_ap)

"""
CONFIRMED THAT compute_ap WORKS

eval_bin = 'eval_bin/compute_ap'
aps = []
for i, scores_i in enumerate(tqdm(scores)):
    tmp_rnk = f'tmp/{dataset.query_ids[i]}.rnk'
    rank = scores_i.argsort()[::-1]
    with open(tmp_rnk, 'w') as f:
        f.write('\n'.join(dataset.image_ids[rank]))