def test(g, model, node_embed, y_true, device, split_idx, args):
    """Run full-neighbor inference over all 'paper' nodes and score each split.

    Args:
        g: DGL heterograph exposing ``g.ndata['feat']['paper']``.
        model: Network called as ``model(emb, blocks)``; its output is indexed
            by node type.
        node_embed: Embedding container handed to ``extract_embed``.
        y_true: Label tensor; it is unsqueezed on dim 1 before scoring
            (presumably shape (N,) -- confirm against the caller).
        device: Device that both the sampled blocks and the input features
            are moved to.
        split_idx: Mapping such that ``split_idx[split]['paper']`` indexes the
            paper nodes of 'train', 'valid' and 'test'.
        args: Mapping providing ``'num_layers'`` for the sampler.

    Returns:
        Tuple ``(train_acc, valid_acc, test_acc)`` from the ogbn-mag evaluator.
    """
    model.eval()
    category = 'paper'
    evaluator = Evaluator(name='ogbn-mag')
    sampler = dgl.dataloading.MultiLayerFullNeighborSampler(args['num_layers'])
    loader = dgl.dataloading.NodeDataLoader(
        g, {category: th.arange(g.num_nodes(category))}, sampler,
        batch_size=16384, shuffle=False, num_workers=0)

    pbar = tqdm(total=y_true.size(0))
    pbar.set_description('Full Inference')

    y_hats = []
    # Pure inference: no autograd graph is needed.
    with th.no_grad():
        for input_nodes, seeds, blocks in loader:
            blocks = [blk.to(device) for blk in blocks]
            # We only predict the nodes with type "category".
            seeds = seeds[category]
            batch_size = seeds.shape[0]

            emb = extract_embed(node_embed, input_nodes)
            # Get the batch's raw "paper" features.
            emb.update(
                {category: g.ndata['feat'][category][input_nodes[category]]})
            # Follow the requested device instead of unconditionally calling
            # .cuda(), so features always live where the blocks live.
            emb = {k: e.to(device) for k, e in emb.items()}

            logits = model(emb, blocks)[category]
            # argmax over raw logits equals argmax over log-softmax.
            y_hats.append(logits.argmax(dim=1, keepdim=True).cpu())
            pbar.update(batch_size)
    pbar.close()

    y_pred = th.cat(y_hats, dim=0)
    y_true = th.unsqueeze(y_true, 1)

    def _split_acc(split):
        # Accuracy restricted to the paper nodes of one split.
        idx = split_idx[split][category]
        return evaluator.eval({
            'y_true': y_true[idx],
            'y_pred': y_pred[idx],
        })['acc']

    return _split_acc('train'), _split_acc('valid'), _split_acc('test')
def test(self, loader, eid):
    """Evaluate the model on ``loader`` and return the validation ROC-AUC.

    Only the validation split is scored because it is the only value this
    method returns; train/test predictions are no longer gathered or scored.

    Args:
        loader: Iterable of batches exposing ``x``, ``edge_index``,
            ``edge_attr``, ``y`` and a ``'valid_mask'`` entry.
        eid: Integer episode id, used only in the progress-bar label.

    Returns:
        The ``'rocauc'`` value reported by the ogbn-proteins evaluator on the
        validation nodes.
    """
    self.model.eval()
    y_true = []
    y_pred = []

    pbar = tqdm(total=len(loader))
    pbar.set_description(f'PPO episode: {eid:01d}')

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for data in loader:
            data = data.to(self.device)
            # TODO: nodes may be concentrated in one cluster, which can
            # lead to OOM.
            out, _ = self.model(data.x, data.edge_index, data.edge_attr)
            mask = data['valid_mask']
            y_true.append(data.y[mask].cpu())
            y_pred.append(out[mask].cpu())
            pbar.update(1)
    pbar.close()

    evaluator = Evaluator('ogbn-proteins')
    valid_rocauc = evaluator.eval({
        'y_true': torch.cat(y_true, dim=0),
        'y_pred': torch.cat(y_pred, dim=0),
    })['rocauc']
    return valid_rocauc
def get_ogb_evaluator(dataset):
    """Return an accuracy function backed by the OGB evaluator for ``dataset``.

    The returned callable takes ``(preds, labels)``, views both as column
    vectors and returns the evaluator's ``"acc"`` entry.
    """
    ogb_evaluator = Evaluator(name=dataset)

    def accuracy(preds, labels):
        scores = ogb_evaluator.eval({
            "y_true": labels.view(-1, 1),
            "y_pred": preds.view(-1, 1),
        })
        return scores["acc"]

    return accuracy
def eval(self, y_true, logits, split_idx):
    """Score ``logits`` against ``y_true`` on the train/valid/test splits.

    The metric depends on ``self.name``:

    * ``'ogb'``: accuracy from the ogbn-arxiv OGB evaluator on argmax
      predictions.
    * ``'wiki'``: micro-averaged F1 on ``sigmoid(logits) > 0.5``.
    * any name in ``self.heterophily_dataset``: ``accuracy_score`` on argmax
      predictions.

    Args:
        y_true: Ground-truth labels, indexable by the split indices.
        logits: Model outputs with classes along dim 1.
        split_idx: Mapping with 'train', 'valid' and 'test' index entries.

    Returns:
        Tuple ``(train, valid, test)`` of metric values.

    Raises:
        ValueError: If ``self.name`` matches none of the supported datasets
            (previously this fell through and returned ``None``).
    """
    splits = ('train', 'valid', 'test')

    if self.name == 'ogb':
        evaluator = Evaluator(name='ogbn-arxiv')
        y_pred = logits.argmax(dim=1, keepdim=True)
        return tuple(
            evaluator.eval({
                'y_true': y_true[split_idx[split]],
                'y_pred': y_pred[split_idx[split]],
            })['acc']
            for split in splits)

    if self.name == 'wiki':
        y_pred = torch.sigmoid(logits) > 0.5
        return tuple(
            f1_score(y_true[split_idx[split]], y_pred[split_idx[split]],
                     average='micro')
            for split in splits)

    if self.name in self.heterophily_dataset:
        y_pred = logits.argmax(dim=1, keepdim=True)
        return tuple(
            accuracy_score(y_true[split_idx[split]], y_pred[split_idx[split]])
            for split in splits)

    raise ValueError(f'Unsupported dataset name for evaluation: {self.name!r}')
class OgbEvaluator(object):
    """Accuracy evaluator for ogbn-arxiv built on the OGB ``Evaluator``."""

    def __init__(self):
        """Load ogbn-arxiv to record its node count and create the evaluator."""
        d_name = "ogbn-arxiv"
        graph, _ = NodePropPredDataset(name=d_name)[0]
        self.num_nodes = graph["num_nodes"]
        self.ogb_evaluator = Evaluator(name="ogbn-arxiv")

    def eval(self, scores, labels, phase):
        """Return ``{'<phase>_acc': accuracy}`` for per-class ``scores``.

        Predictions are the argmax of ``scores`` along axis 1, reshaped to a
        column before being passed to the OGB evaluator with ``labels``.
        """
        pred = np.argmax(scores, axis=1).reshape([-1, 1])
        acc = self.ogb_evaluator.eval({
            'y_true': labels,
            'y_pred': pred,
        })['acc']
        return {'%s_acc' % (phase): acc}
def get_ogb_evaluator(predicts: torch.Tensor, labels: torch.Tensor):
    """
    Compute accuracy with the OGB evaluator for ogbn-mag.

    :param predicts: Tensor, shape (N, )
    :param labels: Tensor, shape (N, )
    :return: the ``'acc'`` value reported by the evaluator
    """
    evaluator = Evaluator(name='ogbn-mag')
    # The evaluator is fed NumPy column vectors.
    y_pred = predicts.cpu().numpy()
    y_true = labels.cpu().numpy()
    scores = evaluator.eval({
        "y_true": y_true.reshape(-1, 1),
        "y_pred": y_pred.reshape(-1, 1),
    })
    return scores['acc']
def eval_on(self, loader, trainer):
    """Extend the parent's evaluation results with OGB node-property metrics.

    Runs ``self.model`` over every batch in ``loader`` (moving batches to
    "cuda" when ``trainer.on_gpu`` is set), concatenates predictions and
    targets along axis 0 and merges the evaluator's output into the dict
    returned by the parent ``eval_on``.
    """
    results_dict = super().eval_on(loader, trainer)
    evaluator = NodePropEvaluator(name=self.task_name)

    pred_chunks = []
    true_chunks = []
    for batch in loader:
        if trainer.on_gpu:
            batch = batch.to("cuda")
        output = self.model(batch)
        pred_chunks.append(output.cpu().detach().numpy())
        true_chunks.append(batch.y.cpu().detach().numpy())

    all_true = np.concatenate(true_chunks, axis=0)
    all_pred = np.concatenate(pred_chunks, axis=0)
    scores = evaluator.eval({"y_true": all_true, "y_pred": all_pred})
    results_dict.update(scores)
    return results_dict
def evaluate_node_classification(predicts: torch.Tensor, labels: torch.Tensor):
    """
    Compute accuracy (OGB ogbn-mag evaluator) and macro-averaged F1.

    :param predicts: Tensor, shape (N, )
    :param labels: Tensor, shape (N, )
    :return: tuple ``(accuracy, macro_f1)``
    """
    evaluator = Evaluator(name='ogbn-mag')
    y_pred = predicts.cpu().numpy()
    y_true = labels.cpu().numpy()

    # The OGB evaluator receives column vectors; f1_score gets the arrays
    # exactly as converted above.
    ogb_scores = evaluator.eval({
        "y_true": y_true.reshape(-1, 1),
        "y_pred": y_pred.reshape(-1, 1),
    })
    macro_f1 = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    return ogb_scores['acc'], macro_f1
n_batch=args.vr_num) for node_feature, node_type, edge_time, edge_index, edge_type, ( train_mask, valid_mask, test_mask), ylabel in test_data: node_rep = gnn.forward(node_feature.to(device), node_type.to(device), \ edge_time.to(device), edge_index.to(device), edge_type.to(device)) res = classifier.forward(node_rep[:args.batch_size]) ress += [res] y_pred += torch.stack(ress).mean(dim=0).argmax(dim=1).tolist() y_true += list(ylabel[:args.batch_size]) test_acc = evaluator.eval({ 'y_true': torch.LongTensor(y_true).unsqueeze(-1), 'y_pred': torch.LongTensor(y_pred).unsqueeze(-1) })['acc'] monitor.set_postfix(accuracy=test_acc) elif args.task_type == 'sequential': y_pred = [] y_true = [] pool = mp.Pool(args.n_pool) jobs = prepare_data(pool, task_type='sequential', s_idx=0, n_batch=args.n_batch, batch_size=args.batch_size) with tqdm(np.arange( len(graph.test_paper) / args.n_batch // args.batch_size),
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator
import torch_geometric.transforms as T
from torch_geometric.nn import LabelPropagation

# The dataset is stored under ../data/OGB relative to this script.
root = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'OGB')
transform = T.Compose([
    T.ToUndirected(),
    T.ToSparseTensor(),
])
dataset = PygNodePropPredDataset('ogbn-arxiv', root, transform=transform)
split_idx = dataset.get_idx_split()
evaluator = Evaluator(name='ogbn-arxiv')
data = dataset[0]

# Propagate labels with the training nodes as the mask.
model = LabelPropagation(num_layers=3, alpha=0.9)
out = model(data.y, data.adj_t, mask=split_idx['train'])
y_pred = out.argmax(dim=-1, keepdim=True)

valid_idx = split_idx['valid']
val_acc = evaluator.eval({
    'y_true': data.y[valid_idx],
    'y_pred': y_pred[valid_idx],
})['acc']

test_idx = split_idx['test']
test_acc = evaluator.eval({
    'y_true': data.y[test_idx],
    'y_pred': y_pred[test_idx],
})['acc']

print(f'Val: {val_acc:.4f}, Test: {test_acc:.4f}')
class NodeClassificationTrainer(metaclass=ABCMeta):
    """Abstract epoch-loop driver for node classification.

    Subclasses implement ``_train`` (returning ``loss, train_acc``) and
    ``_evaluate`` (returning ``val_acc, test_acc``).
    """

    def __init__(self, model, g, features, optimizer, stopper, loss_func, sup,
                 cf):
        self.trainer = None
        self.model = model
        self.g = g.cpu()
        self.features = features
        self.optimizer = optimizer
        self.stopper = stopper
        self.loss_func = loss_func
        self.cf = cf
        self.device = cf.device
        self.epochs = cf.epochs
        self.n_class = cf.n_class

        # Copy every attribute of ``sup`` onto the trainer; attributes set
        # above that share a name with one of ``sup``'s are overwritten.
        self.__dict__.update(sup.__dict__)

        # Split indices and labels are then re-bound to the configured device.
        moved = [t.to(cf.device) for t in (sup.train_x, sup.val_x, sup.test_x)]
        self.train_x, self.val_x, self.test_x = moved
        self.labels = sup.labels.to(cf.device)

        self._evaluator = Evaluator(name='ogbn-arxiv')

        def accuracy(pred, labels):
            # argmax predictions vs. labels, both as column vectors.
            return self._evaluator.eval({
                "y_pred": pred.argmax(dim=-1, keepdim=True),
                "y_true": labels.view(-1, 1)
            })["acc"]

        self.evaluator = accuracy

    @abstractmethod
    def _train(self):
        return None, None

    @abstractmethod
    def _evaluate(self):
        return None, None

    def run(self):
        """Train for ``self.epochs`` epochs with optional early stopping.

        When a stopper is configured, the weights stored at ``stopper.path``
        are loaded into the model before it is returned.
        """
        for epoch in range(self.epochs):
            t0 = time()
            loss, train_acc = self._train()
            val_acc, test_acc = self._evaluate()
            print_log({
                'Epoch': epoch,
                'Time': time() - t0,
                'loss': loss,
                'TrainAcc': train_acc,
                'ValAcc': val_acc,
                'TestAcc': test_acc
            })
            should_stop = (self.stopper is not None
                           and self.stopper.step(val_acc, self.model, epoch))
            if should_stop:
                print(
                    f'Early stopped, loading model from epoch-{self.stopper.best_epoch}'
                )
                break

        if self.stopper is not None:
            state = th.load(self.stopper.path)
            self.model.load_state_dict(state)
        return self.model

    def eval_and_save(self):
        """Evaluate once and persist the formatted results via ``save_results``."""
        val_acc, test_acc = self._evaluate()
        res = {'test_acc': f'{test_acc:.4f}', 'val_acc': f'{val_acc:.4f}'}
        if self.stopper is not None:
            res['best_epoch'] = self.stopper.best_epoch
        save_results(self.cf, res)
# print('res size={}, batch.y size={}'.format(res.size(), len(batch.y))) batch.y = torch.LongTensor(batch.y).to(device) loss = criterion(res, batch.y) single_epoch['train_loss'].append(loss.cpu().detach()) optimizer.zero_grad() torch.cuda.empty_cache() loss.backward() train_losses += [loss.cpu().detach().tolist()] train_step += 1 scheduler.step(train_step) y_pred.append(res.detach().cpu().argmax(dim=1)) ylabel.append(batch.y.cpu()) del res, loss # print('ylabel', torch.cat(ylabel,dim=-1).size()) # print('y_pred', torch.cat(y_pred,dim=-1).size()) train_acc = evaluator.eval({'y_true': torch.cat(ylabel,dim=-1).view(-1,1), 'y_pred': torch.cat(y_pred,dim=-1).view(-1,1)})['acc'] del loader, train_data print('end of training') print(("Epoch: {} {}s LR: {} Train Loss: {} Last train Acc:{}").format(\ epoch, (st - et), optimizer.param_groups[0]['lr'], np.average(train_losses), \ train_acc)) model.eval() with torch.no_grad(): y_pred = [] ylabel = [] pool.close() pool.join() pool = mp.Pool(args.n_pool) jobs = prepare_data(pool, task_type = 'sequential', s_idx = 0, n_batch = args.n_batch, batch_size=args.batch_size)
test_res = classifier.forward(node_rep[:len(ylabel)][test_mask]) train_loss = criterion(train_res, ylabel[train_mask]) optimizer.zero_grad() train_loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) optimizer.step() train_step += 1 scheduler.step(train_step) train_acc = evaluator.eval({ 'y_true': ylabel[train_mask].unsqueeze(-1), 'y_pred': train_res.argmax(dim=1).unsqueeze(-1) })['acc'] valid_acc = evaluator.eval({ 'y_true': ylabel[valid_mask].unsqueeze(-1), 'y_pred': valid_res.argmax(dim=1).unsqueeze(-1) })['acc'] test_acc = evaluator.eval({ 'y_true': ylabel[test_mask].unsqueeze(-1), 'y_pred': test_res.argmax(dim=1).unsqueeze(-1) })['acc'] stat += [[train_loss.item(), train_acc, valid_acc, test_acc]]
class Metrics:
    """Metric selector binding ``calc`` and ``is_better`` to a chosen metric.

    ``metric`` is one of ``'f1'``, ``'accuracy'`` or ``'accuracy_ogb'``; the
    last one scores through the OGB ``Evaluator`` of the named dataset.
    """

    # Short dataset name -> official OGB dataset name.
    full_graph_name = {
        'arxiv': 'ogbn-arxiv',
        'products': 'ogbn-products',
        'papers100M': 'ogbn-papers100M'
    }

    def __init__(self, name_data, is_sigmoid: bool, metric: str):
        """Bind the metric callables.

        Args:
            name_data: Short dataset name; looked up in ``full_graph_name``
                only when ``metric == 'accuracy_ogb'``.
            is_sigmoid: If true, ``_calc_f1`` thresholds predictions at 0.5
                instead of taking an argmax.
            metric: ``'f1'``, ``'accuracy'`` or ``'accuracy_ogb'``.

        Raises:
            NotImplementedError: For any other ``metric``.
            KeyError: For ``'accuracy_ogb'`` with an unknown ``name_data``.
        """
        self.name_data = name_data
        self.is_sigmoid = is_sigmoid
        self.name = metric
        if metric == 'f1':
            self.calc = self._calc_f1
            self.is_better = self._is_better_f1
        elif metric == 'accuracy':
            self.calc = self._calc_accuracy
            self.is_better = self._is_better_accuracy
        elif metric == 'accuracy_ogb':
            self.evaluator = Evaluator(name=self.full_graph_name[name_data])
            self.calc = self._calc_accuracy_ogb
            self.is_better = self._is_better_accuracy
        else:
            raise NotImplementedError(f'Unsupported metric: {metric!r}')

    @staticmethod
    def _binarize(y_pred):
        """Return a thresholded copy of ``y_pred`` (1 where > 0.5, else 0).

        The input is left untouched and the result keeps the input's dtype.
        """
        arr = np.asarray(y_pred)
        return (arr > 0.5).astype(arr.dtype)

    def _calc_f1(self, y_true, y_pred):
        """
        Compute F1-score (micro- and macro averaged for multiple classes).

        NOTE: for the case of each node having a single label (e.g.,
        ogbn-arxiv), F1-micro score is equivalent to accuracy.
        """
        if not self.is_sigmoid:
            y_true = np.argmax(y_true, axis=1)
            y_pred = np.argmax(y_pred, axis=1)
        else:
            # Threshold into a new array so the caller's predictions are not
            # overwritten as a side effect of computing the metric.
            y_pred = self._binarize(y_pred)
        return {
            'f1mic': metrics.f1_score(y_true, y_pred, average="micro"),
            'f1mac': metrics.f1_score(y_true, y_pred, average="macro")
        }

    def _calc_accuracy(self, y_true, y_pred):
        """Return ``{'accuracy': ...}`` computed on argmax labels."""
        y_true = np.argmax(y_true, axis=1)
        y_pred = np.argmax(y_pred, axis=1)
        # if each node has only 1 ground truth label, accuracy is equivalent
        # to f1-micro
        return {'accuracy': metrics.f1_score(y_true, y_pred, average="micro")}

    def _calc_accuracy_ogb(self, y_true, y_pred):
        """
        This function is equivalent to _calc_accuracy.

        We just do this to conform to the leaderboard requirement
        """
        y_true = np.argmax(y_true, axis=1)[:, np.newaxis]
        y_pred = np.argmax(y_pred, axis=1)[:, np.newaxis]
        acc = self.evaluator.eval({'y_true': y_true, 'y_pred': y_pred})['acc']
        return {'accuracy': acc}

    def _is_better_accuracy(self, loss_all, loss_min_hist, accuracy_all,
                            accuracy_max_hist):
        """True when the latest accuracy strictly beats the historical max."""
        acc_cur = accuracy_all[-1]
        return acc_cur > accuracy_max_hist

    def _is_better_f1(self, loss_all, loss_min_hist, f1mic_all, f1mic_max_hist,
                      f1mac_all, f1mac_max_hist):
        """True when the latest F1-micro strictly beats the historical max."""
        f1mic_cur = f1mic_all[-1]
        return f1mic_cur > f1mic_max_hist
def main():
    """Train and evaluate GeniePath with bandit sampling on an OGB dataset.

    Reads ``FLAGS.dataset`` ("ogbn-proteins" or "ogbn-products"), builds the
    TensorFlow graph, then runs 10 rounds from the same initial checkpoint.
    In each round the checkpoint with the best validation score is restored
    for testing; the mean and std of the test scores are printed at the end.

    Checkpoints are written under ``./save_models/<dataset>/``.
    """
    adj_full, features, train_nodes, y_train, \
        valid_nodes, y_valid, test_nodes, y_test = load_data(FLAGS.dataset)
    print("Finish loading data.")

    # adj_train equals adj_full in ogb dataset
    adj_train = adj_full

    evaluator = Evaluator(name=FLAGS.dataset)
    eval_key = None
    if FLAGS.dataset == "ogbn-proteins":
        eval_key = "rocauc"
    elif FLAGS.dataset == "ogbn-products":
        eval_key = "acc"

    sampler = BanditMPSampler()
    sampler.init(adj_train)
    print("Finish init sampler.")

    n2n_values = np.ones(adj_full.count_nonzero(), dtype=np.float32)
    feature_dim = features.shape[-1]
    label_dim = y_train.shape[-1]

    # Define placeholders
    placeholders = {
        'support': tf.sparse_placeholder(tf.float32),
        'features': tf.placeholder(tf.float32, shape=(None, feature_dim)),
        'node_select': tf.sparse_placeholder(tf.float32),
        'labels': tf.placeholder(tf.float32, shape=(None, label_dim)),
        'dropout': tf.placeholder_with_default(0., shape=()),
        'left': tf.sparse_placeholder(tf.float32),
        'right': tf.sparse_placeholder(tf.float32),
        'n_nd': tf.placeholder(tf.int32, shape=[]),
    }

    # Define task type
    task_type_dict = {
        "ogbn-proteins": "multi-label",
        "ogbn-products": "exclusive-label",
    }
    task_type = task_type_dict[FLAGS.dataset]

    # Create model
    model = GeniePath(task_type,
                      placeholders,
                      input_dim=features.shape[-1],
                      label_dim=label_dim)

    # Initialize session
    sess = tf.Session()

    # Construct val feed dictionary
    support, left, right, node_select = gen_fullgraph(valid_nodes, adj_full,
                                                      n2n_values)
    val_feed_dict = construct_feed_dict(adj_full.shape[0], features,
                                        node_select, support, left, right,
                                        y_valid, placeholders)
    val_feed_dict.update({placeholders['dropout']: 0.})

    # Construct test feed dictionary
    support, left, right, node_select = gen_fullgraph(test_nodes, adj_full,
                                                      n2n_values)
    test_feed_dict = construct_feed_dict(adj_full.shape[0], features,
                                         node_select, support, left, right,
                                         y_test, placeholders)
    test_feed_dict.update({placeholders['dropout']: 0.})

    # Define model evaluation function
    def evaluate(labels, feed_dict):
        outs = sess.run([model.outputs], feed_dict=feed_dict)
        preds = outs[0].tolist()
        eval_true = np.array(labels)
        eval_pred = np.array(preds)
        # Single-label tasks are scored on class indices as column vectors.
        if task_type == "exclusive-label":
            eval_true = np.argmax(eval_true, axis=1).reshape([-1, 1])
            eval_pred = np.argmax(eval_pred, axis=1).reshape([-1, 1])
        eval_res = evaluator.eval({
            "y_true": eval_true,
            "y_pred": eval_pred
        })[eval_key]
        return eval_res

    # Init variables
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()

    ckpt_dir = "./save_models/%s" % FLAGS.dataset

    def ckpt_path(version):
        # Checkpoint file for a given version (0 = initial, rnd = best).
        return "./save_models/{}/{}.ckpt".format(FLAGS.dataset, version)

    # Save initial model. makedirs(exist_ok=True) creates both directory
    # levels and avoids the check-then-create race of exists() + mkdir().
    initial_version = 0
    os.makedirs(ckpt_dir, exist_ok=True)
    saver.save(sess, ckpt_path(initial_version))

    # run 10 times
    results = []
    for rnd in range(1, 11):
        # reset model parameters
        saver.restore(sess, ckpt_path(initial_version))
        train_true = []
        train_pred = []
        # Start below any achievable score so the first epoch always writes a
        # checkpoint; with 0 a run whose validation score never exceeded 0
        # left nothing (or a stale file) to restore for testing.
        best_va = float("-inf")

        # Train model
        for epoch in range(FLAGS.epochs):
            train_losses = []
            for batch in iterate_minibatches([train_nodes, y_train],
                                             batchsize=FLAGS.batchsize,
                                             shuffle=True):
                batch_nodes, y_batch = batch
                subgraph_nodes, support, left, right, node_select, \
                    src_list, dst_list, node_map = gen_subgraph(
                        sampler, batch_nodes, adj_train,
                        neighbor_limit=FLAGS.neighbor_limit)
                features_inputs = features[subgraph_nodes, :]

                # Construct feed dictionary
                feed_dict = construct_feed_dict(len(node_map), features_inputs,
                                                node_select, support, left,
                                                right, y_batch, placeholders)
                feed_dict.update({placeholders['dropout']: FLAGS.dropout})

                # Training step
                outs = sess.run([
                    model.opt_op, model.loss, model.sparse_attention_l0,
                    model.outputs
                ],
                                feed_dict=feed_dict)
                train_losses.append(outs[1])

                # Update sample probs
                sampler.update(np.array(src_list, dtype=np.int32),
                               np.array(dst_list, dtype=np.int32), outs[2])

                train_true.extend(y_batch.tolist())
                train_pred.extend(outs[3].tolist())

            # compute Train eval
            if task_type == "exclusive-label":
                train_true = np.argmax(train_true, axis=1).reshape([-1, 1])
                train_pred = np.argmax(train_pred, axis=1).reshape([-1, 1])
            eval_tr = evaluator.eval({
                "y_true": np.array(train_true),
                "y_pred": np.array(train_pred)
            })[eval_key]
            train_true = []
            train_pred = []

            # Valid
            eval_va = evaluate(y_valid, val_feed_dict)

            print("Round:", '%02d' % rnd, "Epoch:", '%04d' % (epoch + 1),
                  "loss=", "{:.4f}".format(np.mean(train_losses)),
                  "{}_tr=".format(eval_key), "{:.4f}".format(eval_tr),
                  "{}_va=".format(eval_key), "{:.4f}".format(eval_va))

            if eval_va > best_va:
                best_va = eval_va
                os.makedirs(ckpt_dir, exist_ok=True)
                saver.save(sess, ckpt_path(rnd))

        # Testing
        saver.restore(sess, ckpt_path(rnd))
        eval_te = evaluate(y_test, test_feed_dict)
        print("Round:", '%02d' % rnd, "Test=", "{:.4f}".format(eval_te))
        results.append(eval_te)

    # print result
    results = np.array(results)
    print('Final Test: {:.4f} ± {:.4f}'.format(results.mean(), results.std()))
def main():
    """Train a GNN on an OGB node-property dataset with PGL on CPU.

    Parses ``--epochs`` and ``--dataset``, builds the static train and
    evaluation programs, then for every epoch runs one full-graph training
    step and prints the OGB evaluator output for the train, valid and test
    splits.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='Graph Dataset')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--dataset',
                        type=str,
                        default="ogbn-proteins",
                        help='dataset name (default: ogbn-proteins)')
    args = parser.parse_args()

    place = fluid.CPUPlace()  # Dataset too big to use GPU

    # automatic dataloading and splitting
    dataset = PglNodePropPredDataset(name=args.dataset)
    splitted_idx = dataset.get_idx_split()

    # automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    graph_data, label = dataset[0]

    train_program = fluid.Program()
    startup_program = fluid.Program()

    # degree normalize: indegree ** -0.5 for nodes with incoming edges, else 0
    indegree = graph_data.indegree()
    norm = np.zeros_like(indegree, dtype="float32")
    norm[indegree > 0] = np.power(indegree[indegree > 0], -0.5)
    graph_data.node_feat["norm"] = np.expand_dims(norm, -1).astype("float32")
    graph_data.node_feat["x"] = np.zeros((len(indegree), 1), dtype="int64")
    graph_data.edge_feat["feat"] = graph_data.edge_feat["feat"].astype(
        "float32")

    model = GNNModel(name="gnn",
                     num_task=dataset.num_tasks,
                     emb_dim=64,
                     num_layers=2)

    with fluid.program_guard(train_program, startup_program):
        gw = pgl.graph_wrapper.StaticGraphWrapper("graph", graph_data, place)
        pred = model.forward(gw)
        sigmoid_pred = fluid.layers.sigmoid(pred)

    # Cloned before the loss/optimizer ops are added below.
    val_program = train_program.clone(for_test=True)

    initializer = []
    with fluid.program_guard(train_program, startup_program):
        train_node_index, init = paddle_helper.constant(
            "train_node_index", dtype="int64", value=splitted_idx["train"])
        initializer.append(init)

        train_node_label, init = paddle_helper.constant(
            "train_node_label",
            dtype="float32",
            value=label[splitted_idx["train"]].astype("float32"))
        initializer.append(init)

        train_pred_t = fluid.layers.gather(pred, train_node_index)
        train_loss_t = fluid.layers.sigmoid_cross_entropy_with_logits(
            x=train_pred_t, label=train_node_label)
        train_loss_t = fluid.layers.reduce_sum(train_loss_t)
        # NOTE(review): this sigmoid output is never fetched in this function.
        train_pred_t = fluid.layers.sigmoid(train_pred_t)

        adam = fluid.optimizer.Adam(
            learning_rate=1e-2,
            regularization=fluid.regularizer.L2DecayRegularizer(
                regularization_coeff=0.0005))
        adam.minimize(train_loss_t)

    exe = fluid.Executor(place)
    exe.run(startup_program)
    gw.initialize(place)
    for init in initializer:
        init(place)

    for epoch in range(1, args.epochs + 1):
        loss = exe.run(train_program, feed={}, fetch_list=[train_loss_t])
        print("Loss %s" % loss[0])

        print("Evaluating...")
        y_pred = exe.run(val_program, feed={}, fetch_list=[sigmoid_pred])[0]

        # Score each split with the same evaluator call.
        result = {
            split: evaluator.eval({
                "y_true": label[splitted_idx[split]],
                "y_pred": y_pred[splitted_idx[split]]
            })
            for split in ("train", "valid", "test")
        }
        print(result)