def _flush_log_rows(log, rows):
    """Return ``log`` extended with the buffered ``rows`` and empty the buffer.

    Args:
        log: pandas DataFrame holding the rows logged so far.
        rows: list of dicts (one per training step) not yet merged in ``log``.

    Returns:
        The DataFrame containing the old rows followed by the buffered ones.
    """
    if rows:
        log = pd.concat([log, pd.DataFrame(rows)], ignore_index=True)
        rows.clear()
    return log


def main(args):
    """Train generator/encoder/discriminator on one category and score it.

    Builds the three models and their optimizers, runs the training loop over
    ``train_dataset`` while logging per-step losses/scores, periodically
    renders previews, checkpoints and evaluates on the test set, and finally
    evaluates the best checkpoint (by AUC).

    Args:
        args: parsed command-line namespace; the attributes read here are
            exactly those accessed below (``category``, ``seed``, ``lr``,
            ``d_iter``, ``ge_decay``, ``lambda_``, ...).

    Raises:
        SystemExit: with code 0 when the experiment already exists.
    """
    # do not track lambda param, it can be changed after train
    exp = Experiment(args, ignore=('lambda_', ))
    print(exp)
    if exp.found:
        print('Already exists: SKIPPING')
        # `exit()` is injected by the `site` module and is not guaranteed to
        # exist; raising SystemExit has the same effect everywhere.
        raise SystemExit(0)

    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)

    # get data
    train_dataset = get_train_data(args.category,
                                   image_size=args.image_size,
                                   patch_size=args.patch_size,
                                   batch_size=args.batch_size,
                                   n_batches=args.n_batches,
                                   rotation_range=args.rotation_range,
                                   seed=args.seed)
    test_dataset, test_labels = get_test_data(args.category,
                                              image_size=args.image_size,
                                              patch_size=args.patch_size,
                                              batch_size=args.batch_size)
    is_object = args.category in objects

    # build models
    generator = make_generator(args.latent_size,
                               channels=args.channels,
                               upsample_first=is_object,
                               upsample_type=args.ge_up,
                               bn=args.ge_bn,
                               act=args.ge_act)
    encoder = make_encoder(args.patch_size,
                           args.latent_size,
                           channels=args.channels,
                           bn=args.ge_bn,
                           act=args.ge_act)
    discriminator = make_discriminator(args.patch_size,
                                       args.latent_size,
                                       channels=args.channels,
                                       bn=args.d_bn,
                                       act=args.d_act)
    # feature extractor model for evaluation
    discriminator_features = get_discriminator_features_model(discriminator)

    # build optimizers
    generator_encoder_optimizer = O.Adam(args.lr,
                                         beta_1=args.ge_beta1,
                                         beta_2=args.ge_beta2)
    discriminator_optimizer = O.Adam(args.lr,
                                     beta_1=args.d_beta1,
                                     beta_2=args.d_beta2)

    # reference to the models to use in eval
    generator_eval = generator
    encoder_eval = encoder

    # for smoothing generator and encoder evolution
    if args.ge_decay > 0:
        ema = tf.train.ExponentialMovingAverage(decay=args.ge_decay)
        generator_ema = tf.keras.models.clone_model(generator)
        encoder_ema = tf.keras.models.clone_model(encoder)
        generator_eval = generator_ema
        encoder_eval = encoder_ema

    # checkpointer
    checkpoint = tf.train.Checkpoint(
        generator=generator,
        encoder=encoder,
        discriminator=discriminator,
        generator_encoder_optimizer=generator_encoder_optimizer,
        discriminator_optimizer=discriminator_optimizer)
    best_ckpt_path = exp.ckpt(f'ckpt_{args.category}_best')
    last_ckpt_path = exp.ckpt(f'ckpt_{args.category}_last')

    # log stuff
    log, log_file = exp.require_csv(f'log_{args.category}.csv.gz')
    metrics, metrics_file = exp.require_csv(f'metrics_{args.category}.csv')
    log_rows = []  # per-step entries buffered between two flushes to disk
    best_metric = 0.
    best_saved = False  # becomes True once the "best" checkpoint is on disk
    best_recon = float('inf')
    best_recon_file = exp.path_to(f'best_recon_{args.category}.png')
    last_recon_file = exp.path_to(f'last_recon_{args.category}.png')

    # animate generation during training
    n_preview = 6
    train_batch = next(iter(train_dataset))[:n_preview]
    test_batch = next(iter(test_dataset))[0][:n_preview]
    latent_batch = tf.random.normal([n_preview, args.latent_size])

    if not is_object:  # take random patches from test images
        # NOTE(review): randint's upper bound is exclusive, so the last valid
        # offset is never drawn and image_size == patch_size raises here.
        patch_location = np.random.randint(0,
                                           args.image_size - args.patch_size,
                                           (n_preview, 2))
        test_batch = [
            x[i:i + args.patch_size, j:j + args.patch_size, :]
            for x, (i, j) in zip(test_batch, patch_location)
        ]
        test_batch = K.stack(test_batch)

    video_out = exp.path_to(f'{args.category}.mp4')
    video_options = dict(fps=30, codec='libx265',
                         quality=4)  # see imageio FFMPEG options
    video_saver = VideoSaver(train_batch, test_batch, latent_batch, video_out,
                             **video_options)
    video_saver.generate_and_save(generator, encoder)

    # train loop
    progress = tqdm(train_dataset, desc=args.category, dynamic_ncols=True)
    try:
        for step, image_batch in enumerate(progress, start=1):
            if step == 1 or args.d_iter == 0:
                # only for JIT compilation (tf.function) to work
                d_train = True
                ge_train = True
            elif args.d_iter:
                n_iter = step % (abs(args.d_iter) + 1)  # can be in [0, d_iter]
                d_train = (n_iter != 0) if (args.d_iter > 0) else (
                    n_iter == 0)  # True in [1, d_iter]
                ge_train = not d_train  # True when step == d_iter + 1
            else:  # d_iter == None: dynamic adjustment
                # uses the scores produced by the previous step
                d_train = (scores['fake_score'] > 0) or (scores['real_score'] < 0)
                ge_train = (scores['real_score'] > 0) or (scores['fake_score'] < 0)

            losses, scores = train_step(image_batch,
                                        generator,
                                        encoder,
                                        discriminator,
                                        generator_encoder_optimizer,
                                        discriminator_optimizer,
                                        d_train,
                                        ge_train,
                                        alpha=args.alpha,
                                        gp_weight=args.gp_weight)

            if (args.ge_decay > 0) and (step % 10 == 0):
                ge_vars = generator.variables + encoder.variables
                ema.apply(ge_vars)  # update exponential moving average

            # tensor to numpy
            losses = {
                n: l.numpy() if l is not None else l
                for n, l in losses.items()
            }
            scores = {
                n: s.numpy() if s is not None else s
                for n, s in scores.items()
            }

            # log step metrics (buffered: growing a DataFrame row by row
            # copies the whole frame at every step)
            log_rows.append({
                'step': step,
                'timestamp': pd.to_datetime('now'),
                **losses,
                **scores
            })

            if step % 100 == 0:
                if args.ge_decay > 0:
                    # copy the averaged weights into the EMA model clones
                    ge_ema_vars = generator_ema.variables + encoder_ema.variables
                    for v_ema, v in zip(ge_ema_vars, ge_vars):
                        v_ema.assign(ema.average(v))

                preview = video_saver.generate_and_save(
                    generator_eval, encoder_eval)

            if step % 1000 == 0:
                log = _flush_log_rows(log, log_rows)
                log.to_csv(log_file, index=False)
                checkpoint.write(file_prefix=last_ckpt_path)

                auc, balanced_accuracy = evaluate(generator_eval,
                                                  encoder_eval,
                                                  discriminator_features,
                                                  test_dataset,
                                                  test_labels,
                                                  patch_size=args.patch_size,
                                                  lambda_=args.lambda_)

                entry = {
                    'step': step,
                    'auc': auc,
                    'balanced_accuracy': balanced_accuracy
                }
                # DataFrame.append was removed in pandas 2.0
                metrics = pd.concat([metrics, pd.DataFrame([entry])],
                                    ignore_index=True)
                metrics.to_csv(metrics_file, index=False)

                if auc > best_metric:
                    best_metric = auc
                    checkpoint.write(file_prefix=best_ckpt_path)
                    best_saved = True

                # save last image to inspect it during training
                imageio.imwrite(last_recon_file, preview)

                recon = losses['images_reconstruction_loss']
                if recon < best_recon:
                    best_recon = recon
                    imageio.imwrite(best_recon_file, preview)

                progress.set_postfix({
                    'AUC': f'{auc:.1%}',
                    'BalAcc': f'{balanced_accuracy:.1%}',
                    'BestAUC': f'{best_metric:.1%}',
                })

    except KeyboardInterrupt:
        checkpoint.write(file_prefix=last_ckpt_path)
    finally:
        log = _flush_log_rows(log, log_rows)
        log.to_csv(log_file, index=False)
        video_saver.close()

    # score the test set
    if best_saved:
        checkpoint.read(best_ckpt_path)
    else:
        # no evaluation improved on the initial best_metric (e.g. the run was
        # shorter than 1000 steps): there is no best checkpoint to restore
        print('No best checkpoint was written: scoring the current weights')
    auc, balanced_accuracy = evaluate(generator,
                                      encoder,
                                      discriminator_features,
                                      test_dataset,
                                      test_labels,
                                      patch_size=args.patch_size,
                                      lambda_=args.lambda_)
    print(f'{args.category}: AUC={auc}, BalAcc={balanced_accuracy}')
def _scores_from_ranking(ranked_sim, ranked_ids, n_points):
    """Scatter ranked search results into a dense, negated score matrix.

    Args:
        ranked_sim: 2-D array, one row per query, with the values returned by
            ``index.search``.
        ranked_ids: 2-D integer array of the same shape with the matching
            database ids; negative entries mark "no result".
        n_points: number of columns of the output matrix.

    Returns:
        A ``(n_queries, n_points)`` float array holding ``-ranked_sim`` at
        each retrieved id and ``-inf`` for ids that were not retrieved.
    """
    scores = np.full((ranked_ids.shape[0], n_points), np.inf)
    # id 0 is a valid database id: only negative ids are padding
    rows, cols = np.nonzero(ranked_ids >= 0)
    scores[rows, ranked_ids[rows, cols]] = ranked_sim[rows, cols]
    return -scores


def main(args):
    """Build (or load) an IVF-PQ faiss index and evaluate it per nprobe.

    For each tested ``nprobe`` value the function computes (or reloads) the
    full query/database score matrix, measures five query-time runs, and
    stores AP, nDCG and nDCG@25 in a per-nprobe metrics CSV. Every step is
    skipped when its output is already present on disk.

    Args:
        args: parsed command-line namespace (``dataset``, ``features``,
            ``limit``, ``n_cells``, ``code_size``, ``pretrained``,
            ``output``). ``args.n_cells`` is filled in when it is ``None``.
    """
    dataset, q, x = utils.load_benchmark(args.dataset, args.features)

    q = utils.load_features(q, chunks=(2500, 2048))
    x = utils.load_features(x, chunks=(2500, 2048))

    if args.limit:
        x = x[:args.limit]

    n_points, dim = x.shape

    if args.n_cells is None:
        step_k = 2500
        max_points_per_centroid = 256.0

        # n_train_points = min(n_points, 120000)  # train index with less points or it crashes..
        # smallest multiple of step_k keeping cells below the max occupancy
        min_k = np.ceil(
            n_points / (step_k * max_points_per_centroid)).astype(int) * step_k
        args.n_cells = min_k
        print('Using min suggested cells:', args.n_cells)

    exp = Experiment(args, root=args.output, ignore=('output', 'pretrained'))
    print(exp)

    # create or load faiss index
    index_file = exp.path_to('index.faiss')
    if not os.path.exists(index_file):
        if args.pretrained:
            print('Loading pre-trained empty index ...')
            index = faiss.read_index(args.pretrained)
            train_time = None
        else:
            tmp = utils.compute_if_dask(x)
            print('Creating index: training ...')
            index = faiss.index_factory(
                dim, 'IVF{},PQ{}'.format(args.n_cells, args.code_size))
            # index = faiss.index_factory(dim, 'IVF{},Flat'.format(args.n_cells))
            start = time.time()
            index.train(tmp)
            train_time = time.time() - start
            del tmp

        print('Creating index: adding ...')
        start = time.time()
        bs = 2**14
        for i in trange(0, x.shape[0], bs):
            batch = utils.compute_if_dask(x[i:i + bs])
            index.add(batch)
        add_time = time.time() - start

        faiss.write_index(index, index_file)
        size = os.path.getsize(index_file)
        index_stats_file = exp.path_to('index_stats.csv')
        index_stats = pd.DataFrame(
            {
                'size': size,
                'train_time': train_time,
                'add_time': add_time
            },
            index=[0])
        index_stats.to_csv(index_stats_file, index=False)
    else:
        print('Loading pre-built index ...')
        index = faiss.read_index(index_file)

    # , 50, 100, 250, 500, 1000, 2500, 5000)
    # materialised as a tuple so that tqdm knows the total
    n_probes = tuple(p for p in (1, 2, 5, 10, 25) if p <= args.n_cells)
    params = dict(vars(args))  # copy: do not add 'nprobe' to the caller's args
    progress = tqdm(n_probes)
    for p in progress:
        index.nprobe = p
        params['nprobe'] = p
        progress.set_postfix(
            {k: v for k, v in params.items() if k != 'output'})

        scores = None
        scores_file = exp.path_to(f'scores_np{p}.h5')
        if not os.path.exists(scores_file):
            print('Computing scores:', scores_file)
            q = utils.compute_if_dask(q)
            # execute kNN search using k = dataset size
            ranked_sim, ranked_ids = index.search(q, n_points)
            # we need a similarity matrix, we construct it from the ranked
            # results; ids that were not recovered keep the lowest score
            scores = _scores_from_ranking(ranked_sim, ranked_ids, n_points)
            utils.save_as_hdf5(scores, scores_file, progress=True)

        query_times, query_times_file = exp.require_csv('query_times.csv',
                                                        index='n_probes')
        for i in trange(1, 6):
            if utils.value_missing(query_times, p, f'query_time_run{i}'):
                q = utils.compute_if_dask(q)
                start = time.time()
                index.search(q, n_points)
                query_time = time.time() - start
                query_times.at[p, f'query_time_run{i}'] = query_time
                query_times.to_csv(query_times_file)

        metrics, metrics_file = exp.require_csv(f'metrics_np{p}.csv')

        if 'ap' not in metrics:
            if scores is None:
                print('Loading scores...')
                scores = utils.load_features(scores_file)
            print('Computing mAP...')
            metrics['ap'] = dataset.score(scores[...],
                                          reduction=False,
                                          progress=True)
            metrics.to_csv(metrics_file, index=False)

        if 'ndcg' not in metrics:
            dataset._load()  # TODO in y_true getter
            if scores is None:
                print('Loading scores...')
                scores = utils.load_features(scores_file)
            print('Computing nDCG...')
            y_true = dataset.y_true[:, :args.limit] if args.limit else dataset.y_true
            bs = 5
            ndcg = []
            for i in trange(0, y_true.shape[0], bs):
                ndcg.append(
                    dcg(y_true[i:i + bs], scores[i:i + bs], normalized=True))
            ndcg = np.concatenate(ndcg)
            # metrics['ndcg'] = dcg(y_true, scores, normalized=True)
            metrics['ndcg'] = ndcg
            metrics.to_csv(metrics_file, index=False)

        if 'ndcg@25' not in metrics:
            dataset._load()  # TODO in y_true getter
            if scores is None:
                progress.write('Loading scores...')
                scores = utils.load_features(scores_file)[...]
            progress.write('Computing nDCG@25...')
            y_true = dataset.y_true[:, :args.limit] if args.limit else dataset.y_true
            bs = 50
            ndcg = []
            for i in trange(0, y_true.shape[0], bs):
                ndcg.append(
                    dcg(y_true[i:i + bs],
                        scores[i:i + bs],
                        p=25,
                        normalized=True))
            metrics['ndcg@25'] = np.concatenate(ndcg)
            # metrics['ndcg'] = dcg(dataset.y_true, scores, normalized=True)
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'nDCG@25: {metrics["ndcg@25"].mean()}')

        metrics['n_probes'] = p
        metrics.to_csv(metrics_file, index=False)
def main(args):
    """Attack every test sample of a trained model and log the outcomes.

    Loads the model of the experiment in ``args.run``, runs a basic iterative
    attack (L2 or L-infinity) on each test image, and appends one row per
    sample to ``results.csv`` inside an ``adv-attack`` sub-experiment. Samples
    already present in that file are skipped, so the run can be resumed.

    Args:
        args: parsed command-line namespace (``run``, ``tol``, ``distance``,
            ``stepsize``, ``epsilon``). ``args.tol`` is filled from the
            training parameters when it is ``None``.

    Raises:
        ValueError: if the dataset or the distance is not supported.
    """
    exp = Experiment.from_dir(args.run, main='model')
    params = next(exp.params.itertuples())

    # data setup
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Lambda(lambda x: x.numpy())])
    preproc = utils.PREPROC[params.dataset]
    if params.dataset == 'mnist':
        data = MNIST('data/mnist',
                     download=True,
                     train=False,
                     transform=transform)
    elif params.dataset == 'cifar10':
        data = CIFAR10('data/cifar10',
                       download=True,
                       train=False,
                       transform=transform)
        # expand dimensions
        preproc = tuple(np.array(v).reshape((3, 1, 1)) for v in preproc)
    else:
        raise ValueError(f'Unsupported dataset: {params.dataset!r}')

    # model setup
    model = utils.load_model(exp).eval().cuda()
    if args.tol is None:
        args.tol = params.tol
    if params.model == 'odenet':
        model.odeblock.tol = args.tol

    fmodel = foolbox.models.PyTorchModel(model,
                                         bounds=(0, 1),
                                         num_classes=10,
                                         preprocessing=preproc)

    # attack setup
    if args.distance == 2:
        attack = foolbox.attacks.L2BasicIterativeAttack
        distance = foolbox.distances.MSE
    elif args.distance == float('inf'):
        attack = foolbox.attacks.LinfinityBasicIterativeAttack
        distance = foolbox.distances.Linf
    else:
        raise ValueError(f'Unsupported distance: {args.distance!r}')

    attack = attack(fmodel, distance=distance)

    sub_exp_root = exp.path_to('adv-attack')
    os.makedirs(sub_exp_root, exist_ok=True)
    sub_exp = Experiment(args, root=sub_exp_root, ignore=('run', ))
    print(sub_exp)

    results_file = sub_exp.path_to('results.csv')
    results = pd.read_csv(results_file) if os.path.exists(
        results_file) else pd.DataFrame()

    # ids processed by a previous run; built once instead of scanning the
    # whole column at every iteration
    done = set() if results.empty else set(results.sample_id.tolist())

    # perform attack
    progress = tqdm(data)
    for i, (image, label) in enumerate(progress):
        if i in done:
            continue

        if not isinstance(label, int):
            label = label.item()

        start = time.time()
        adversarial = attack(image,
                             label,
                             unpack=False,
                             binary_search=False,
                             stepsize=args.stepsize,
                             epsilon=args.epsilon)
        elapsed = time.time() - start

        result = pd.DataFrame(dict(
            sample_id=i,
            label=label,
            elapsed_time=elapsed,
            distance=adversarial.distance.value,
            adversarial_class=adversarial.adversarial_class,
            original_class=adversarial.original_class,
        ), index=[0])

        # DataFrame.append was removed in pandas 2.0
        results = pd.concat([results, result], ignore_index=True)
        # written after every sample so an interrupted run can be resumed
        results.to_csv(results_file, index=False)

        # a missing adversarial class means the attack did not succeed
        success = ~results.adversarial_class.isna()
        successes = success.sum()
        success_rate = success.mean()
        progress.set_postfix({
            'success_rate':
            f'{success_rate:.2%} ({successes}/{len(success)})'
        })
def main(args):
    """Train (or resume) the segmentation model, evaluate and export it.

    Splits the data into train/val/test, builds or reloads the model, fits it
    with checkpointing, CSV logging and early stopping, runs the test-set
    evaluation, and exports the best-mask weights as SavedModel and TF.js.

    Args:
        args: parsed command-line namespace (``seed``, ``data``, ``split``,
            ``resolution``, ``batch_size``, ``resume``, ``lr``, ``epochs``);
            the whole namespace is also handed to ``build_model`` as a dict.

    Raises:
        ValueError: if ``args.split`` is neither ``'subjects'`` nor
            ``'random'``.
    """
    exp = Experiment(args, ignore=('epochs', 'resume'))
    print(exp)

    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)

    data = load_datasets(args.data)

    # TRAIN/VAL/TEST SPLIT
    if args.split == 'subjects':  # by SUBJECTS
        val_subjects = (6, 9, 11, 13, 16, 28, 30, 48, 49)
        test_subjects = (3, 4, 19, 38, 45, 46, 51, 52)
        train_data = data[~data['sub'].isin(val_subjects + test_subjects)]
        val_data = data[data['sub'].isin(val_subjects)]
        test_data = data[data['sub'].isin(test_subjects)]
    elif args.split == 'random':  # 70-20-10 %
        train_data, valtest_data = train_test_split(data,
                                                    test_size=.3,
                                                    shuffle=True)
        val_data, test_data = train_test_split(valtest_data, test_size=.33)
    else:
        # previously fell through and failed later with an unbound local
        raise ValueError(f'Unsupported split: {args.split!r}')

    lengths = map(len, (data, train_data, val_data, test_data))
    print("Total: {} - Train / Val / Test: {} / {} / {}".format(*lengths))

    x_shape = (args.resolution, args.resolution, 1)
    y_shape = (args.resolution, args.resolution, 1)

    train_gen, _ = get_loader(train_data,
                              batch_size=args.batch_size,
                              shuffle=True,
                              augment=True,
                              x_shape=x_shape)
    val_gen, _ = get_loader(val_data,
                            batch_size=args.batch_size,
                            x_shape=x_shape)
    # test_gen, test_categories = get_loader(test_data, batch_size=1, x_shape=x_shape)

    log = exp.path_to('log.csv')

    # weights_only checkpoints
    best_weights_path = exp.path_to('best_weights.h5')
    best_mask_weights_path = exp.path_to('best_weights_mask.h5')

    # whole model checkpoints
    best_ckpt_path = exp.path_to('best_model.h5')
    last_ckpt_path = exp.path_to('last_model.h5')

    if args.resume and os.path.exists(last_ckpt_path):
        custom_objects = {
            'AdaBeliefOptimizer': AdaBeliefOptimizer,
            'iou_coef': evaluate.iou_coef,
            'dice_coef': evaluate.dice_coef,
            'hard_swish': hard_swish
        }
        model = tf.keras.models.load_model(last_ckpt_path,
                                           custom_objects=custom_objects)
        optimizer = model.optimizer
        # one row is logged per finished epoch
        initial_epoch = len(pd.read_csv(log))
    else:
        config = vars(args)
        model = build_model(x_shape, y_shape, config)
        optimizer = AdaBeliefOptimizer(learning_rate=args.lr,
                                       print_change_log=False)
        initial_epoch = 0

    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics={
                      'mask': [evaluate.iou_coef, evaluate.dice_coef],
                      'tags': 'binary_accuracy'
                  })

    # marker file left behind when training ended by early stopping
    model_stopped_file = exp.path_to('early_stopped.txt')
    need_training = not os.path.exists(
        model_stopped_file) and initial_epoch < args.epochs
    if need_training:
        best_checkpointer = ModelCheckpoint(best_weights_path,
                                            monitor='val_loss',
                                            save_best_only=True,
                                            save_weights_only=True)
        best_mask_checkpointer = ModelCheckpoint(best_mask_weights_path,
                                                 monitor='val_mask_dice_coef',
                                                 mode='max',
                                                 save_best_only=True,
                                                 save_weights_only=True)
        last_checkpointer = ModelCheckpoint(last_ckpt_path,
                                            save_best_only=False,
                                            save_weights_only=False)
        logger = CSVLogger(log, append=args.resume)
        progress = TqdmCallback(verbose=1,
                                initial=initial_epoch,
                                dynamic_ncols=True)
        early_stop = tf.keras.callbacks.EarlyStopping(
            monitor='val_mask_dice_coef', mode='max', patience=100)

        callbacks = [
            best_checkpointer, best_mask_checkpointer, last_checkpointer,
            logger, progress, early_stop
        ]

        model.fit(train_gen,
                  epochs=args.epochs,
                  callbacks=callbacks,
                  initial_epoch=initial_epoch,
                  steps_per_epoch=len(train_gen),
                  validation_data=val_gen,
                  validation_steps=len(val_gen),
                  verbose=False)

        if model.stop_training:
            # create the (empty) marker file
            with open(model_stopped_file, 'w'):
                pass

        # NOTE(review): this stores the weights the model holds right after
        # fit(); no best-weights file is reloaded before saving -- confirm
        # this is what evaluate.evaluate expects to find in best_model.h5.
        tf.keras.models.save_model(model,
                                   best_ckpt_path,
                                   include_optimizer=False)

    # evaluation on test set
    evaluate.evaluate(exp, force=need_training)

    # save best snapshot in SavedModel format
    model.load_weights(best_mask_weights_path)
    best_savedmodel_path = exp.path_to('best_savedmodel')
    model.save(best_savedmodel_path, save_traces=True)

    # export to tfjs (Layers model)
    tfjs_model_dir = exp.path_to('tfjs')
    tfjs.converters.save_keras_model(model, tfjs_model_dir)
def main(args):
    """Train an ODENet or ResNet classifier, checkpointing every epoch.

    Builds data loaders, model, optimizer and LR scheduler from ``args``,
    optionally resumes from the last checkpoint, then alternates training and
    evaluation for the remaining epochs, saving a checkpoint and pushing the
    metrics to the experiment log after each one.

    Args:
        args: parsed command-line namespace (``dataset``, ``model``,
            ``optim``, ``lr``, ``wd``, ``lrschedule``, ``epochs``, ``resume``,
            ``device``, ...).

    Raises:
        ValueError: if ``args.optim`` or ``args.lrschedule`` is not one of
            the supported values.
    """
    root = 'runs_' + args.dataset
    exp = Experiment(args,
                     root=root,
                     main='model',
                     ignore=('cuda', 'device', 'epochs', 'resume'))

    print(exp)
    if os.path.exists(exp.path_to('log')) and not args.resume:
        print('Skipping ...')
        sys.exit(0)

    train_data, test_data, in_ch, out = load_dataset(args)
    train_loader = DataLoader(train_data,
                              batch_size=args.batch_size,
                              shuffle=True)
    test_loader = DataLoader(test_data,
                             batch_size=args.batch_size,
                             shuffle=False)

    if args.model == 'odenet':
        model = ODENet(in_ch,
                       out=out,
                       n_filters=args.filters,
                       downsample=args.downsample,
                       method=args.method,
                       tol=args.tol,
                       adjoint=args.adjoint,
                       dropout=args.dropout)
    else:
        model = ResNet(in_ch,
                       out=out,
                       n_filters=args.filters,
                       downsample=args.downsample,
                       dropout=args.dropout)

    model = model.to(args.device)
    if args.optim == 'sgd':
        optimizer = SGD(model.parameters(),
                        lr=args.lr,
                        momentum=0.9,
                        weight_decay=args.wd)
    elif args.optim == 'adam':
        optimizer = Adam(model.parameters(),
                         lr=args.lr,
                         weight_decay=args.wd)
    else:
        raise ValueError(f'Unsupported optimizer: {args.optim!r}')

    # print(train_data)
    # print(test_data)
    # print(model)
    # print(optimizer)

    if args.resume:
        # map_location lets a checkpoint saved on another device be resumed
        ckpt = torch.load(exp.ckpt('last'),
                          map_location=torch.device(args.device))
        print('Loaded: {}'.format(exp.ckpt('last')))
        model.load_state_dict(ckpt['model'])
        optimizer.load_state_dict(ckpt['optim'])
        start_epoch = ckpt['epoch'] + 1
        best_accuracy = exp.log['test_acc'].max()
        print('Resuming from epoch {}: {}'.format(start_epoch, exp.name))
    else:
        # accuracy of the untrained model is the initial "best"
        metrics = evaluate(test_loader, model, args)
        best_accuracy = metrics['test_acc']
        start_epoch = 1

    if args.lrschedule == 'fixed':
        scheduler = LambdaLR(
            optimizer,
            lr_lambda=lambda x: 1)  # no-op scheduler, just for cleaner code
    elif args.lrschedule == 'plateau':
        scheduler = ReduceLROnPlateau(optimizer,
                                      mode='max',
                                      patience=args.patience)
    elif args.lrschedule == 'cosine':
        scheduler = CosineAnnealingLR(optimizer,
                                      args.lrcycle,
                                      last_epoch=start_epoch - 2)
    else:
        raise ValueError(f'Unsupported LR schedule: {args.lrschedule!r}')

    # `initial` counts the epochs already done (start_epoch - 1); using
    # start_epoch made the bar end one past `total`
    progress = trange(start_epoch,
                      args.epochs + 1,
                      initial=start_epoch - 1,
                      total=args.epochs)
    for epoch in progress:
        metrics = {'epoch': epoch}

        progress.set_postfix({'Best ACC': f'{best_accuracy:.2%}'})

        progress.set_description('TRAIN')
        train_metrics = train(train_loader, model, optimizer, args)

        progress.set_description('EVAL')
        test_metrics = evaluate(test_loader, model, args)

        is_best = test_metrics['test_acc'] > best_accuracy
        best_accuracy = max(test_metrics['test_acc'], best_accuracy)

        metrics.update(train_metrics)
        metrics.update(test_metrics)

        save_checkpoint(
            exp, {
                'epoch': epoch,
                'params': vars(args),
                'model': model.state_dict(),
                'optim': optimizer.state_dict(),
                'metrics': metrics
            }, is_best)

        exp.push_log(metrics)
        # only the plateau scheduler consumes the monitored metric
        sched_args = metrics[
            'test_acc'] if args.lrschedule == 'plateau' else None
        scheduler.step(sched_args)
def main(args):
    """Compare hidden trajectories of original and adversarial samples.

    For every test sample with a successful attack recorded in the
    ``adv-attack`` sub-experiment, the attack is re-run, both images are fed
    to the feature extractor, and the per-time-step L2 distance and cosine
    similarity between the two trajectories are appended to ``diff_l2.csv``
    and ``diff_cos.csv``. Samples already in both files are skipped.

    Args:
        args: parsed command-line namespace (``run``, ``resolution``, ``tol``,
            ``distance``, ``epsilon``). ``args.tol`` is filled from the
            training parameters when it is ``None``.

    Raises:
        ValueError: if the dataset or the distance is not supported.
    """
    exp = Experiment.from_dir(args.run, main='model')
    params = next(exp.params.itertuples())

    # data setup
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Lambda(lambda x: x.numpy())
    ])
    preproc = utils.PREPROC[params.dataset]
    if params.dataset == 'mnist':
        data = MNIST('data/mnist', download=True, train=False, transform=transform)
    elif params.dataset == 'cifar10':
        data = CIFAR10('data/cifar10', download=True, train=False, transform=transform)
        # expand dimensions
        preproc = tuple(np.array(v).reshape((3, 1, 1)) for v in preproc)
    else:
        raise ValueError(f'Unsupported dataset: {params.dataset!r}')

    # time steps at which the extractor returns the hidden state
    t = np.linspace(0, 1, args.resolution + 1).tolist()

    # model setup
    model = utils.load_model(exp).eval().cuda()
    extractor = utils.load_model(exp).eval().cuda()
    extractor.to_features_extractor(keep_pool=False)
    extractor.odeblock.t1 = t

    if args.tol is None:
        args.tol = params.tol

    if params.model == 'odenet':
        model.odeblock.tol = args.tol
        extractor.odeblock.tol = args.tol

    fmodel = foolbox.models.PyTorchModel(model, bounds=(0, 1), num_classes=10, preprocessing=preproc)

    # attack setup
    if args.distance == 2:
        attack = foolbox.attacks.L2BasicIterativeAttack
        distance = foolbox.distances.MSE
    elif args.distance == float('inf'):
        attack = foolbox.attacks.LinfinityBasicIterativeAttack
        distance = foolbox.distances.Linf
    else:
        raise ValueError(f'Unsupported distance: {args.distance!r}')

    attack = attack(fmodel, distance=distance)

    sub_exp_root = exp.path_to('adv-attack')
    os.makedirs(sub_exp_root, exist_ok=True)
    sub_exp = Experiment(args, root=sub_exp_root, ignore=('run', 'resolution'))
    print(sub_exp)

    results_file = sub_exp.path_to('results.csv')
    diff_l2_file = sub_exp.path_to('diff_l2.csv')
    diff_cos_file = sub_exp.path_to('diff_cos.csv')

    if not os.path.exists(results_file):
        print('No results on attacks found:', results_file)
        return

    results = pd.read_csv(results_file).set_index('sample_id')
    diff_l2 = pd.read_csv(diff_l2_file) if os.path.exists(diff_l2_file) else pd.DataFrame()
    diff_cos = pd.read_csv(diff_cos_file) if os.path.exists(diff_cos_file) else pd.DataFrame()

    # Column labels are strings: read_csv returns the header as strings, so
    # float labels on the new rows would not line up with a reloaded frame
    # and every resumed run would grow a second, misaligned set of columns.
    diff_cols = ['sample_id'] + [str(ti) for ti in t]

    # samples present in both files, computed once instead of per iteration
    if diff_l2.empty or diff_cos.empty:
        done = set()
    else:
        done = set(diff_l2.sample_id.tolist()) & set(diff_cos.sample_id.tolist())

    progress = tqdm(data)
    for i, (image, label) in enumerate(progress):
        if i in done:
            continue  # skipping, already computed

        perturbation_distance = results.at[i, 'distance']
        if perturbation_distance == 0 or not np.isfinite(perturbation_distance):
            continue  # skipping natural errors or not-found adversarials

        if not isinstance(label, int):
            label = label.item()

        # NOTE(review): no `stepsize` is passed here, so the attack's default
        # is used -- confirm it matches the run that produced results.csv.
        adversarial = attack(image, label, unpack=False, binary_search=False, epsilon=args.epsilon)

        if adversarial.perturbed is None:
            tqdm.write(f'WARN: adversarial not found when reproducing [sample_id = {i}]')
            continue

        with torch.no_grad():
            original_image = torch.from_numpy(adversarial.unperturbed).cuda()
            original_traj = extractor(original_image.unsqueeze(0))

            adversarial_image = torch.from_numpy(adversarial.perturbed).cuda()
            adversarial_traj = extractor(adversarial_image.unsqueeze(0))

            # one flattened feature vector per time step
            adversarial_traj = adversarial_traj.reshape(args.resolution + 1, -1)
            original_traj = original_traj.reshape(args.resolution + 1, -1)

            # L2
            diff_traj = adversarial_traj - original_traj
            diff_traj = (diff_traj ** 2).sum(1).sqrt()
            diff_traj = diff_traj.cpu().numpy()

            tmp = pd.DataFrame([[i] + diff_traj.tolist()], columns=diff_cols)
            # DataFrame.append was removed in pandas 2.0
            diff_l2 = pd.concat([diff_l2, tmp], ignore_index=True)
            diff_l2.to_csv(diff_l2_file, index=False)

            # Cosine similarity
            diff_traj = F.cosine_similarity(adversarial_traj, original_traj)
            diff_traj = diff_traj.cpu().numpy()

            tmp = pd.DataFrame([[i] + diff_traj.tolist()], columns=diff_cols)
            diff_cos = pd.concat([diff_cos, tmp], ignore_index=True)
            diff_cos.to_csv(diff_cos_file, index=False)
def main(args):
    """Index thresholded features in Elasticsearch and evaluate retrieval.

    For every ``(threshold, sq_factor)`` pair the function computes feature
    densities, (re)builds the Elasticsearch index when it is missing or
    incomplete, scores every query against the whole index, and stores query
    times, AP and nDCG. Each step is skipped when its output already exists.

    Args:
        args: parsed command-line namespace (``dataset``, ``features``,
            ``limit``, ``crelu``, ``threshold``, ``sq_factor``, ``output``,
            ``force``). ``threshold`` and ``sq_factor`` are iterated in
            lockstep.
    """
    es = Elasticsearch(timeout=30, max_retries=10, retry_on_timeout=True)

    dataset, q, x = utils.load_benchmark(args.dataset, args.features)

    q = utils.load_features(q, chunks=(5000, 2048))
    x = utils.load_features(x, chunks=(5000, 2048))

    if args.limit:
        x = x[:args.limit]

    # Sizes are taken after applying the limit (as the Lucene variant does):
    # with the full size, `count < n_samples` below was always true on
    # limited runs and the index was rebuilt every time.
    n_queries, n_samples = q.shape[0], x.shape[0]

    if args.crelu:
        q = crelu(q)
        x = crelu(x)

    # copy: the per-iteration overrides must not overwrite the caller's args
    params = dict(vars(args))
    ignore = ('output', 'force')
    progress = tqdm(zip(args.threshold, args.sq_factor), total=len(args.threshold))
    for thr, s in progress:
        params['threshold'] = thr
        params['sq_factor'] = s
        progress.set_postfix({k: v for k, v in params.items() if k not in ignore})
        exp = Experiment(params, root=args.output, ignore=ignore)

        density, density_file = exp.require_csv('density.csv')
        if 'query_density' not in density:
            progress.write('Computing query density ...')
            q_sq = thr_sq(q, thr, s)
            # fraction of non-zero entries per feature dimension
            q_density = (q_sq != 0).mean(axis=0)
            q_density = utils.compute_if_dask(q_density)
            density['query_density'] = q_density
            density.to_csv(density_file, index=False)

        if 'database_density' not in density:
            progress.write('Computing database density ...')
            x_sq = thr_sq(x, thr, s)
            x_density = (x_sq != 0).mean(axis=0)
            x_density = utils.compute_if_dask(x_density)
            density['database_density'] = x_density
            density.to_csv(density_file, index=False)

        index_name = exp.name.lower()
        if not es.indices.exists(index_name) or es.count(index=index_name)['count'] < n_samples or args.force:
            # x_sq = thr_sq(x, thr, s)
            x_ids, _ = dataset.images()
            index_actions = generate_index_actions(es, index_name, x, x_ids, thr, s, 50)
            # index_actions = tqdm(index_actions, total=n_samples)

            progress.write(f'Indexing: {index_name}')

            index_config = {
                "mappings": {
                    "_source": {"enabled": False},  # do not store STR
                    "properties": {"repr": {"type": "text"}}  # FULLTEXT
                },
                "settings": {
                    "index": {"number_of_shards": 1, "number_of_replicas": 0},
                    "analysis": {"analyzer": {"first": {"type": "whitespace"}}}
                }
            }

            # es.indices.delete(index_name, ignore=(400, 404))
            es.indices.create(index_name, index_config, ignore=400)
            # disable refresh while bulk-loading; restored right after
            es.indices.put_settings({"index": {"refresh_interval": "-1", "number_of_replicas": 0}}, index_name)

            indexing = parallel_bulk(es, index_actions, thread_count=4, chunk_size=150, max_chunk_bytes=2**26)
            indexing = tqdm(indexing, total=n_samples)
            start = time.time()
            deque(indexing, maxlen=0)  # exhaust the generator, keep nothing
            add_time = time.time() - start
            progress.write(f'Index time: {add_time}')

            es.indices.put_settings({"index": {"refresh_interval": "1s"}}, index_name)
            es.indices.refresh()

            index_stats_file = exp.path_to('index_stats.csv')
            index_stats = pd.DataFrame({'add_time': add_time}, index=[0])
            index_stats.to_csv(index_stats_file, index=False)

        metrics, metrics_file = exp.require_csv('metrics.csv')

        scores = None
        scores_file = exp.path_to('scores.h5')
        if not os.path.exists(scores_file):
            progress.write('Computing scores...')

            # document id -> column of the score matrix
            xid2idx = {k: i for i, k in enumerate(dataset.images()[0])}

            q_sq = thr_sq(q, thr, s)
            q_sq = utils.compute_if_dask(q_sq, progress=False)

            scores = np.zeros((n_queries, n_samples), dtype=np.float32)
            query_times = []

            for i, qi in enumerate(tqdm(q_sq)):
                # NOTE(review): the collapsed source does not show whether
                # `"size": n_samples` was part of the commented-out line or
                # a live key; it is left out here, so scan() uses its
                # default batch size (same hits, different batching).
                query = {
                    "query": {"query_string": {"default_field": "repr", "query": surrogate_text(qi, boost=True)}},
                    # "from": 0, "size": n_samples
                }

                start = time.time()
                for hit in tqdm(scan(es, query, index=index_name, preserve_order=True), total=n_samples):
                    j = xid2idx[hit['_id']]
                    scores[i, j] = hit['_score']
                query_times.append(time.time() - start)

            metrics['query_time'] = query_times
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'Query time: {metrics.query_time.sum()}')

            utils.save_as_hdf5(scores, scores_file, progress=True)

        if 'ap' not in metrics:
            if scores is None:
                progress.write('Loading scores...')
                scores = utils.load_features(scores_file)[...]
            progress.write('Computing mAP...')
            metrics['ap'] = dataset.score(scores, reduction=False, progress=True)
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'mAP: {metrics.ap.mean()}')

        if 'ndcg' not in metrics:
            dataset._load()  # TODO in y_true getter
            if scores is None:
                progress.write('Loading scores...')
                scores = utils.load_features(scores_file)[...]
            progress.write('Computing nDCG...')
            # keep the ground truth aligned with the (possibly limited) scores
            y_true = dataset.y_true[:, :args.limit] if args.limit else dataset.y_true
            metrics['ndcg'] = dcg(y_true, scores, normalized=True)
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'nDCG: {metrics.ndcg.mean()}')
def main(args):
    """Index thresholded features with Lucene and evaluate retrieval.

    For every ``(threshold, sq_factor)`` pair the function computes feature
    densities, fills the Lucene index when it holds fewer documents than the
    database, scores every query against the index, and stores query times,
    AP, nDCG and nDCG@25. Each step is skipped when its output already
    exists.

    Args:
        args: parsed command-line namespace (``dataset``, ``features``,
            ``limit``, ``crelu``, ``threshold``, ``sq_factor``, ``output``).
            ``threshold`` and ``sq_factor`` are iterated in lockstep.
    """
    lucene_vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    lucene_vm.attachCurrentThread()

    dataset, q, x = utils.load_benchmark(args.dataset, args.features)

    q = utils.load_features(q, chunks=(5000, 2048))
    x = utils.load_features(x, chunks=(5000, 2048))

    if args.limit:
        x = x[:args.limit]

    n_queries, n_samples = q.shape[0], x.shape[0]

    if args.crelu:
        q = crelu(q)
        x = crelu(x)

    # copy: the per-iteration overrides must not overwrite the caller's args
    params = dict(vars(args))
    ignore = ('output', 'force')
    progress = tqdm(zip(args.threshold, args.sq_factor),
                    total=len(args.threshold))
    for thr, s in progress:
        params['threshold'] = thr
        params['sq_factor'] = s
        progress.set_postfix(
            {k: v for k, v in params.items() if k not in ignore})
        exp = Experiment(params, root=args.output, ignore=ignore)

        density, density_file = exp.require_csv('density.csv')
        if 'query_density' not in density:
            progress.write('Computing query density ...')
            q_re = q.rechunk({0: -1, 1: 'auto'}) if utils.is_dask(q) else q
            q_sq = threshold(q_re, thr, s)
            # fraction of non-zero entries per feature dimension
            q_density = (q_sq != 0).mean(axis=0)
            q_density = utils.compute_if_dask(q_density)
            density['query_density'] = q_density
            density.to_csv(density_file, index=False)

        if 'database_density' not in density:
            progress.write('Computing database density ...')
            # rechunk the database itself (the queries were used by mistake)
            x_re = x.rechunk({0: -1, 1: 'auto'}) if utils.is_dask(x) else x
            x_sq = threshold(x_re, thr, s)
            x_density = (x_sq != 0).mean(axis=0)
            x_density = utils.compute_if_dask(x_density)
            density['database_density'] = x_density
            density.to_csv(density_file, index=False)

        index_stats, index_stats_file = exp.require_csv('index_stats.csv')

        index_name = exp.name.lower()
        index_path = exp.path_to('lucene_index')
        with LuceneIndex(index_path) as idx:
            if idx.count() < n_samples:
                x_sq = threshold(x, thr, s)
                x_sq = batch_features(x_sq, 5000)
                # x_str = features_to_str(x_sq, 5000)

                progress.write(f'Indexing: {index_name}')
                start = time.time()
                for i, xi in enumerate(tqdm(x_sq, total=n_samples)):
                    # the document id is the row position, as a string
                    idx.add(str(i), xi)
                add_time = time.time() - start
                progress.write(f'Index time: {add_time}')

                index_stats.at[0, 'add_time'] = add_time

            if 'size' not in index_stats.columns:
                index_stats.at[0, 'size'] = utils.get_folder_size(index_path)

            index_stats.to_csv(index_stats_file, index=False)

            metrics, metrics_file = exp.require_csv('metrics.csv')

            scores = None
            scores_file = exp.path_to('scores.h5')
            if not os.path.exists(scores_file):
                progress.write('Computing scores...')

                q_sq = threshold(q, thr, s)
                q_sq = utils.compute_if_dask(q_sq, progress=False)
                # q_str = features_to_str(q_sq, n_queries, boost=True)

                scores = np.zeros((n_queries, n_samples), dtype=np.float32)
                query_times = []

                # Sequential search. The parallel variant that used to sit in
                # an unreachable `else:` branch was dropped (it was annotated
                # as currently slower).
                for i, qi in enumerate(tqdm(q_sq, total=n_queries)):
                    start = time.time()
                    if qi.any():
                        for j, score in tqdm(idx.query(qi, n_samples),
                                             total=n_samples):
                            scores[i, int(j)] = score
                        query_times.append(time.time() - start)
                    else:
                        # all-zero query: nothing to search, no timing
                        query_times.append(None)

                metrics['query_time'] = query_times
                metrics.to_csv(metrics_file, index=False)
                progress.write(f'Query time: {metrics.query_time.sum()}')

                utils.save_as_hdf5(scores, scores_file, progress=True)

            if 'ap' not in metrics:
                dataset._load()  # TODO in y_true getter
                if scores is None:
                    progress.write('Loading scores...')
                    scores = utils.load_features(scores_file)[...]
                progress.write('Computing mAP...')
                metrics['ap'] = dataset.score(scores,
                                              reduction=False,
                                              progress=True)
                metrics.to_csv(metrics_file, index=False)
                progress.write(f'mAP: {metrics.ap.mean()}')

            if 'ndcg' not in metrics:
                dataset._load()  # TODO in y_true getter
                if scores is None:
                    progress.write('Loading scores...')
                    scores = utils.load_features(scores_file)[...]
                progress.write('Computing nDCG...')
                y_true = dataset.y_true[:, :args.limit] if args.limit else dataset.y_true
                bs = 50
                ndcg = []
                for i in trange(0, y_true.shape[0], bs):
                    ndcg.append(
                        dcg(y_true[i:i + bs],
                            scores[i:i + bs],
                            normalized=True))
                metrics['ndcg'] = np.concatenate(ndcg)
                # metrics['ndcg'] = dcg(dataset.y_true, scores, normalized=True)
                metrics.to_csv(metrics_file, index=False)
                progress.write(f'nDCG: {metrics.ndcg.mean()}')

            if 'ndcg@25' not in metrics:
                dataset._load()  # TODO in y_true getter
                if scores is None:
                    progress.write('Loading scores...')
                    scores = utils.load_features(scores_file)[...]
                progress.write('Computing nDCG@25...')
                y_true = dataset.y_true[:, :args.limit] if args.limit else dataset.y_true
                bs = 50
                ndcg = []
                for i in trange(0, y_true.shape[0], bs):
                    ndcg.append(
                        dcg(y_true[i:i + bs],
                            scores[i:i + bs],
                            p=25,
                            normalized=True))
                metrics['ndcg@25'] = np.concatenate(ndcg)
                # metrics['ndcg'] = dcg(dataset.y_true, scores, normalized=True)
                metrics.to_csv(metrics_file, index=False)
                progress.write(f'nDCG@25: {metrics["ndcg@25"].mean()}')