def test(self, dataloader):
    """Evaluate ``self.classifier`` on ``dataloader``.

    Args:
        dataloader: iterable yielding ``(_input, _label, _name)`` batches.

    Returns:
        tuple: ``(acc, auc, ce, (scores_all, labels))`` where ``scores_all``
        holds one score per sample (positive-class column for 2-D outputs)
        and ``labels`` the ground-truth label array.
    """
    labels = []
    scores_all = []
    self.classifier.eval()
    with torch.no_grad():
        for data in dataloader:
            _input, _label, _name = data
            # NOTE(review): inputs are intentionally kept on CPU here (the
            # original `.to(device)` call was disabled) — confirm the
            # classifier lives on the same device as `_input`.
            _label = _label.long()
            out = self.classifier(_input)
            labels.append(_label.data.numpy())
            scores_all.append(out.data.cpu().numpy())
    labels = np.hstack(labels)
    if scores_all[0].ndim == 1:
        # Already one score per sample: concatenate directly.
        scores_all = np.hstack(scores_all)
    else:
        # 2-D logits: keep only the positive-class column.
        scores_all = np.vstack(scores_all)[:, 1]
    auc, acc, ce = compute_metrics(scores_all, labels)
    return acc, auc, ce, (scores_all, labels)
def test(self, val_models, val_labels):
    """Score each checkpoint in ``val_models`` with the meta-classifier.

    Args:
        val_models: iterable of checkpoints accepted by ``self.load_model``.
        val_labels: ground-truth labels aligned with ``val_models``.

    Returns:
        tuple: ``(acc, auc, ce, (scores_all, val_labels))`` where
        ``scores_all`` is the positive-class score per model.
    """
    device = self.args.device
    pred = []
    with torch.no_grad():
        for model in tqdm(val_models):
            cnn = self.load_model(model)
            cnn.eval()
            cnn.to(device)
            # Meta-classifier logit from the probe inputs / weights / bias.
            logit = self.compute_logit(
                cnn, self.classifier.X, self.classifier.W, self.classifier.b
            )
            pred.append(logit.data.cpu().numpy())
    # Keep only the positive-class column for metric computation.
    scores_all = np.vstack(pred)[:, 1]
    auc, acc, ce = compute_metrics(scores_all, val_labels)
    return acc, auc, ce, (scores_all, val_labels)
def test(self, dataloader):
    """Evaluate the classifier, optionally averaging stochastic passes.

    When ``self.args.stocastic`` is set, the classifier runs ``self.args.T``
    times per batch; with ``self.args.hard`` the hard predictions are
    averaged and thresholded, otherwise the raw outputs are averaged.

    Args:
        dataloader: iterable yielding ``(_input, _valid, _label, _arch, _name)``.

    Returns:
        tuple: ``(acc, auc, ce, (scores_all, labels))``.
    """
    device = self.args.device
    labels = []
    scores_all = []
    self.classifier.eval()
    with torch.no_grad():
        for data in dataloader:
            _input, _valid, _label, _arch, _name = data
            _input = _input.to(device)
            _valid = _valid.to(device)
            # NOTE(review): attribute spelling "stocastic" kept as-is — it is
            # part of the args interface.
            if self.args.stocastic:
                out_lst = []
                pred_lst = []
                for _ in range(self.args.T):
                    _out = self.classifier((_input, _valid, _arch))
                    out_lst.append(_out)
                    pred_lst.append(_out.argmax(1))
                if self.args.hard:
                    # Average hard predictions across the T passes; the
                    # resulting vote fraction is also used as the score.
                    out = torch.stack(pred_lst).float().mean(0)
                else:
                    # Average raw outputs across the T passes.
                    out = torch.stack(out_lst).mean(0)
            else:
                out = self.classifier((_input, _valid, _arch))
            labels.append(_label.data.numpy())
            scores_all.append(out.data.cpu().numpy())
    labels = np.hstack(labels)
    if scores_all[0].ndim == 1:
        # One score per sample already (e.g. hard-vote fraction).
        scores_all = np.hstack(scores_all)
    else:
        # 2-D outputs: keep the positive-class column.
        scores_all = np.vstack(scores_all)[:, 1]
    auc, acc, ce = compute_metrics(scores_all, labels)
    return acc, auc, ce, (scores_all, labels)
# Building ensemble (average): train `args.num_ensemble` independent models,
# average their per-sample sigmoid probabilities, and convert the mean
# probability back into a logit score for metric computation.
test_info = []
for num_models in range(args.num_ensemble):
    print(
        f"Training ensemble model {num_models} / {args.num_ensemble}")
    valid_acc, valid_auc, valid_ce, valid_info = model.train(
        dl_train, dl_val)
    acc, auc, ce, _test_info = model.test(dl_test)
    test_info.append(_test_info)
    print(f"validation auc = {valid_auc}\ntest auc = {auc}")

# Each _test_info is (scores, labels); average probabilities across models.
prob = np.asarray([sigmoid(it[0]) for it in test_info]).mean(0)
# Convert back to logits; the epsilon guards the denominator when prob == 1.
scores = np.log(prob / (1 - prob + 1e-10))
labels = test_info[0][1]
auc, acc, ce = compute_metrics(scores, labels)
print(f"Fold {i}\nAcc: {acc:.2f}\nAuc: {auc:.2f}\nCE: {ce:.2f}")
def main(args):
    """Evaluate the best checkpoint of an ingredient-prediction model.

    Loads the test split, runs the model's ``sample`` method over every
    batch, accumulates confusion-matrix counts, and prints accuracy/F1.

    Args:
        args: parsed command-line namespace (save_dir, project_name,
            model_name, crop_size, batch_size, etc.).
    """
    where_to_save = os.path.join(args.save_dir, args.project_name,
                                 args.model_name)
    logs_dir = os.path.join(where_to_save, 'logs')
    if not args.log_term:
        # FIX: ensure the log directory exists before redirecting
        # stdout/stderr into it (otherwise open() raises on a fresh run).
        os.makedirs(logs_dir, exist_ok=True)
        sys.stdout = open(os.path.join(logs_dir, 'eval.log'), 'w')
        sys.stderr = open(os.path.join(logs_dir, 'eval.err'), 'w')

    # Image preprocessing: resize, center-crop, ImageNet normalization.
    transform = transforms.Compose([
        transforms.Resize(args.crop_size),
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    # Data loader over the test split: deterministic order, keep the
    # final partial batch.
    data_loader, dataset = get_loader(args.data_dir, 'test',
                                      args.maxnumlabels,
                                      batch_size=args.batch_size,
                                      transform=transform,
                                      shuffle=False,
                                      num_workers=args.num_workers,
                                      drop_last=False,
                                      max_num_samples=-1)
    ingr_vocab_size = dataset.get_ingrs_vocab_size()
    args.numgens = 1

    # Build the model and load the best checkpoint.
    # NOTE(review): `map_loc` and `device` look like module-level globals
    # defined outside this view — confirm.
    model = get_model(args, ingr_vocab_size)
    model_path = os.path.join(args.save_dir, args.project_name,
                              args.model_name, 'checkpoints',
                              'modelbest.ckpt')
    model.load_state_dict(torch.load(model_path, map_location=map_loc))
    model.eval()
    model = model.to(device)

    # Running confusion-matrix counts accumulated over the whole test set.
    error_types = {
        'tp_i': 0,
        'fp_i': 0,
        'fn_i': 0,
        'tn_i': 0,
        'tp_all': 0,
        'fp_all': 0,
        'fn_all': 0,
    }
    for img_inputs, ingr_gt, img_id, path in tqdm(data_loader):
        ingr_gt = ingr_gt.to(device)
        img_inputs = img_inputs.to(device)
        for _ in range(args.numgens):
            with torch.no_grad():
                outputs = model.sample(img_inputs)
                fake_ingrs = outputs['ingr_ids']
                pred_one_hot = label2onehot(fake_ingrs, ingr_vocab_size - 1)
                target_one_hot = label2onehot(ingr_gt, ingr_vocab_size - 1)
                update_error_types(error_types, pred_one_hot, target_one_hot)

    ret_metrics = {'accuracy': [], 'f1': []}
    compute_metrics(ret_metrics, error_types, ['accuracy', 'f1'],
                    eps=1e-10, weights=None)
    for k, v in ret_metrics.items():
        print(k, np.mean(v))