def feature_analysis(config):
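    """Leave-one-out evaluation per marker: fit a cloned model on all but one
    sample, predict the held-out sample, then report accuracy and the
    eval_results metrics over all held-out predictions."""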
    dataset = MarkerExpressionDataset(config)

    model = get_model(config)
    for marker in dataset.markers:
        all_features, all_labels, _ = dataset.get_all_data(
            marker, feature_selection=False, dup_reduce=True)
        model = base.clone(model)
        all_pred_labels = [0 for _ in range(len(all_labels))]
        all_pred_score = [0 for _ in range(len(all_labels))]
        for i in range(len(all_features)):
            train_features = all_features.copy()
            train_labels = all_labels.copy()
            del train_features[i]
            del train_labels[i]
            # model.fit(all_features, all_labels)
            model.fit(train_features, train_labels)
            all_pred_score[i] = model.predict_proba([all_features[i]])[0]
            all_pred_labels[i] = model.predict([all_features[i]])[0]
        tps = sum([
            y_true == y_pred
            for y_true, y_pred in zip(all_labels, all_pred_labels)
        ])
        acc = tps / len(all_features)
        results = eval_results(all_labels, all_pred_score, dataset.classes)
        print('marker %s: acc %.2f%%' % (marker, 100 * acc))
        print(results)
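For reference, the hand-rolled hold-one-out loop above can also be written with
scikit-learn's cross-validation helpers. A minimal sketch, assuming `model` is a
scikit-learn estimator and the features/labels are plain lists as returned by
get_all_data (an illustration, not the repository's code):

from sklearn.model_selection import LeaveOneOut, cross_val_predict

def loo_predictions(model, features, labels):
    # Each sample is predicted by a model fitted on all remaining samples,
    # mirroring the manual loop in feature_analysis above.
    loo = LeaveOneOut()
    pred_labels = cross_val_predict(model, features, labels, cv=loo)
    pred_scores = cross_val_predict(model, features, labels, cv=loo,
                                    method="predict_proba")
    return pred_labels, pred_scores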
Example #2
def evaluate(model, eval_examples, eval_features, device, args, logger, use_squad_v2, eval_file):
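    """Run SQuAD-style prediction over eval_features, collect per-example
    start/end logits as RawResult objects, and return an OrderedDict with
    F1/EM (plus AvNA when use_squad_v2 is set) together with the raw results."""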
    logger.info("***** Running predictions *****")
    logger.info("  Num orig examples = %d", len(eval_examples))
    logger.info("  Num split examples = %d", len(eval_features))
    logger.info("  Batch size = %d", args.predict_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    all_ling_features = torch.tensor([f.ling_features for f in eval_features], dtype=torch.float)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_ling_features, all_example_index)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

    model.eval()
    all_results = []
    logger.info("Start evaluating")
    for input_ids, input_mask, segment_ids, ling_features, example_indices in tqdm(eval_dataloader, desc="Evaluating"):
        if len(all_results) % 1000 == 0:
            logger.info("Processing example: %d" % (len(all_results)))
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        ling_features = ling_features.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask, ling_features)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(RawResult(unique_id=unique_id,
                                         start_logits=start_logits,
                                         end_logits=end_logits))
    model.train()

    results = eval_results(eval_examples, eval_features, all_results, eval_file,
        args.n_best_size, args.max_answer_length,
        args.do_lower_case, args.verbose_logging,
        args.version_2_with_negative, args.null_score_diff_threshold)

    results_list = [('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, all_results
Example #3
def evaluate(args, model, tokenizer, prefix=""):
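    """Re-rank top-N MS MARCO candidates while passing
    attention_mask_after_softmax_layer_set (the first mask_layer_num layers)
    to the model, write qid/pid/score lines, generate a ranking, report MRR,
    and symlink the rank file with the MRR appended to its name."""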
    eval_dataset = TopNDataset(args.topN_file, tokenizer, "dev.small",
                               args.msmarco_dir, args.collection_memmap_dir,
                               args.tokenize_dir, args.max_query_length,
                               args.max_seq_length)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=get_collate_function(
                                     args.mask_target))

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    attention_mask_after_softmax_layer_set = set(range(args.mask_layer_num))

    logger.info("attention_mask_after_softmax_layer_set: {}".format(
        attention_mask_after_softmax_layer_set))
    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    cnt = 0
    with open(args.output_score_path, 'w') as outputfile:
        for batch, qids, pids in tqdm(
                eval_dataloader,
                desc=f"{args.mask_target}@{args.mask_layer_num}"):
            model.eval()
            batch = {k: v.to(args.device) for k, v in batch.items()}
            batch[
                'attention_mask_after_softmax_layer_set'] = attention_mask_after_softmax_layer_set
            with torch.no_grad():
                outputs = model(**batch)
                scores = outputs[0].detach().cpu().numpy()
                for qid, pid, score in zip(qids, pids, scores[:, 1]):
                    outputfile.write(f"{qid}\t{pid}\t{score}\n")
            cnt += 1
            # if cnt > 1000:
            #     break
    generate_rank(args.output_score_path, args.output_rank_path)
    mrr = eval_results(args.output_rank_path)
    abs_output_rank_path = os.path.abspath(args.output_rank_path)
    mrr_ln_path = f"{abs_output_rank_path}.{mrr:.3f}"
    subprocess.check_call(["ln", "-s", abs_output_rank_path, mrr_ln_path])
    print(mrr)
Example #4
def evaluate(args, model, mode, prefix):
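    """Score an MSMARCODataset split, write qid/docid/score lines to a TSV,
    generate the corresponding rank file, and return MRR when mode == "dev"."""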
    eval_output_dir = args.eval_save_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)

    eval_dataset = MSMARCODataset(mode, args.msmarco_dir,
                                  args.collection_memmap_dir,
                                  args.tokenize_dir, args.max_query_length,
                                  args.max_doc_length)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    collate_fn = get_collate_function(mode=mode)
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=args.eval_batch_size,
                                 num_workers=args.data_num_workers,
                                 collate_fn=collate_fn)

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    output_file_path = f"{eval_output_dir}/{prefix}.{mode}.score.tsv"
    with open(output_file_path, 'w') as outputfile:
        for batch, qids, docids in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            with torch.no_grad():
                batch = {k: v.to(args.device) for k, v in batch.items()}
                outputs = model(**batch)
                scores = torch.diagonal(outputs[0]).detach().cpu().numpy()
                assert len(qids) == len(docids) == len(scores)
                for qid, docid, score in zip(qids, docids, scores):
                    outputfile.write(f"{qid}\t{docid}\t{score}\n")

    rank_output = f"{eval_output_dir}/{prefix}.{mode}.rank.tsv"
    generate_rank(output_file_path, rank_output)

    if mode == "dev":
        mrr = eval_results(rank_output)
        return mrr
Example #5
File: train.py  Project: KaishuaiXu/CLEAR
def evaluate(args, model, mode, prefix, eval_dataset=None):
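    """Evaluate a CLEAR model: score query/passage pairs, write a score TSV,
    generate a rank file, and return MRR (rescaled by 6980 / num_eval_queries)
    when mode == "dev.small"."""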
    eval_output_dir = args.eval_save_dir
    if not os.path.exists(eval_output_dir):
        os.makedirs(eval_output_dir)

    if eval_dataset is None:
        eval_dataset = CLEARDataset(mode=mode, args=args)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    collate_fn = get_collate_function(mode=mode)
    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=args.eval_batch_size,
                                 num_workers=args.data_num_workers,
                                 pin_memory=True,
                                 collate_fn=collate_fn)

    # multi-gpu eval
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    output_file_path = f"{eval_output_dir}/{prefix}.{mode}.score.tsv"
    with open(output_file_path, 'w') as outputfile:
        for batch, qids, pids in tqdm(eval_dataloader, desc="Evaluating"):
            model.eval()
            with torch.no_grad():
                batch = {k: v.to(args.device) for k, v in batch.items()}
                scores = model(**batch)
                assert len(qids) == len(pids) == len(scores)
                for qid, pid, score in zip(qids, pids, scores):
                    outputfile.write(f"{qid}\t{pid}\t{score}\n")

    rank_output = f"{eval_output_dir}/{prefix}.{mode}.rank.tsv"
    generate_rank(output_file_path, rank_output)

    if mode == "dev.small":
        mrr = eval_results(rank_output) * 6980 / args.num_eval_queries
        return mrr
Example #6
def evaluate(args, model):
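    """Probe evaluation on precomputed embeddings for one layer: score
    dev.small query/passage pairs, write scores, generate a ranking, and
    symlink the rank file with the resulting MRR appended to its name."""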
    eval_dataset = ProbDataset(
        f"{args.embd_root}/dev.small/{args.key}/{args.layer}",
        args.msmarco_dir, "dev.small", args.max_token_num)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset,
                                 sampler=eval_sampler,
                                 pin_memory=False,
                                 batch_size=args.eval_batch_size,
                                 collate_fn=get_collate_function(),
                                 num_workers=args.data_num_workers)

    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    model.eval()
    output_score_path = f"{args.eval_output_dir}/layer_{args.layer}.score.tsv"
    output_rank_path = f"{args.eval_output_dir}/layer_{args.layer}.rank.tsv"
    with open(output_score_path, 'w') as outputfile:
        for batch, qids, pids in tqdm(eval_dataloader, desc="Evaluating"):
            del batch['labels']
            batch = {k: v.to(args.device) for k, v in batch.items()}
            with torch.no_grad():
                softmax_logits = model(**batch)[0].detach().cpu().numpy()
                scores = softmax_logits[:, 1]
                for qid, pid, score in zip(qids, pids, scores):
                    outputfile.write(f"{qid}\t{pid}\t{score}\n")
    generate_rank(output_score_path, output_rank_path)
    mrr = eval_results(output_rank_path)
    abs_output_rank_path = os.path.abspath(output_rank_path)
    mrr_ln_path = f"{abs_output_rank_path}.{mrr:.3f}"
    subprocess.check_call(["ln", "-s", abs_output_rank_path, mrr_ln_path])
Example #7
def train_eval(config, exp_path):
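    """Per-marker training and evaluation: log data-cleaning and feature
    selection/transformation details, optionally run GridSearchCV, fit and
    score each cross-validation fold, report train/test metrics, and save a
    summary figure (distributions, confusion matrices, ROC curves, tables)
    plus best_params.yaml under exp_path."""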
    dataset = MarkerExpressionDataset(config)
    if dataset.data_clean is not None:
        with open(os.path.join(exp_path, 'dirty_data.txt'), 'w') as f:
            f.write('---data clean method: %s---\n' % dataset.data_clean)
            for marker, item in dataset.outlier_samples.items():
                f.write('marker %s:\n' % marker)
                for class_id in dataset.classes:
                    f.write('class %s:\n' % class_id)
                    for sample_id in item.keys():
                        if item[sample_id]['class'] == class_id:
                            f.write('\t%s\n' % sample_id)

    if dataset.feature_selection is not None or dataset.feature_transformation is not None:
        with open(
                os.path.join(exp_path,
                             'feature_selection_and_transformation.txt'),
                'w') as f:
            if dataset.feature_selection is not None:
                f.write('---feature selection method: %s---\n' %
                        dataset.feature_selection['method'])
                if 'kwargs' in dataset.feature_selection:
                    f.write('---feature selection kwargs: %s---\n' %
                            str(dataset.feature_selection['kwargs']))
            if dataset.feature_transformation is not None:
                f.write('---feature transformation method: %s---\n' %
                        dataset.feature_transformation['method'])
                if 'kwargs' in dataset.feature_transformation:
                    f.write('---feature transformation kwargs: %s---\n' %
                            str(dataset.feature_transformation['kwargs']))

            for marker in dataset.markers:
                f.write('marker %s:\n' % marker)
                if dataset.fs_metric_params is not None:
                    f.write(
                        '---feature selection and transformation kwargs: %s---\n'
                        % str(dataset.fs_metric_params[marker]))
                if dataset.feature_selection is not None:
                    features = dataset.features
                    feature_index = 0
                    f.write('---selected features---\n')
                    if dataset.feature_selection['method'] == 'custom':
                        support_flags = dataset.feature_selection['selection'][
                            marker]
                    else:
                        support_flags = dataset.feature_selector[
                            marker].get_support()
                    for flag in support_flags:
                        f.write('%s:\t%s\n' % (features[feature_index], flag))
                        feature_index = (feature_index + 1) % len(features)
                if dataset.feature_transformation is not None:
                    components = dataset.feature_transformer[
                        marker].components_
                    f.write('---feature transformation components---:\n%s' %
                            components.tolist())
                    # if 'feature_mean' in config:
                    #     feature_mean = config['feature_mean']
                    #     coefficients = np.abs(feature_mean*components.sum(axis=0)).\
                    #         reshape([len(dataset.features), -1]).sum(axis=0)
                    # else:
                    #     coefficients = np.abs(components.sum(axis=0)).reshape([len(dataset.features), -1]).sum(axis=0)
                    # coefficients = coefficients / coefficients.sum()
                    #
                    # f.write('---feature transformation coefficients---:\n%s' % coefficients.tolist())

    threshold = config.get('threshold', 'roc_optimal')
    metrics_names = ['sensitivity', 'specificity', 'roc_auc_score']
    metrics_avg_names = ['roc_auc_score_avg', 'roc_auc_score_avg_std']

    fig, ax = plt.subplots(9,
                           len(dataset.markers),
                           squeeze=False,
                           figsize=(6 * len(dataset.markers), 40))
    metrics_file = open(os.path.join(exp_path, 'metrics.txt'), 'w')
    metrics_fig_filename = os.path.join(exp_path, 'conf_mat.png')
    best_params = dict()
    all_marker_train_metrics = []
    all_marker_test_metrics = []
    for i, marker in enumerate(dataset.markers):
        model = get_model(config)
        if 'model_kwargs_search' in config:
            # parameter search
            print('parameter search for marker %s...' % marker)
            all_x, all_y, cv_index = dataset.get_all_data(marker)
            best_model = GridSearchCV(model,
                                      param_grid=config['model_kwargs_search'],
                                      cv=cv_index,
                                      scoring='roc_auc_ovr')
            best_model.fit(all_x, all_y)
            best_params[marker] = best_model.best_params_
            print('search done')
        else:
            best_model = model
            best_params[marker] = config['model_kwargs']

        # run train and test
        train_xs = []
        train_ys = []
        train_ys_score = []
        test_xs = []
        test_ys = []
        test_ys_score = []
        for fold_i, (train_x, train_y, test_x,
                     test_y) in enumerate(dataset.get_split_data(marker)):
            model = base.clone(model)
            model.set_params(**best_params[marker])
            model.fit(train_x, train_y)
            # model.classes_ = dataset.classes
            train_xs += train_x
            train_ys += train_y
            test_xs += test_x
            test_ys += test_y
            train_y_score = model.predict_proba(train_x).tolist()
            train_ys_score += train_y_score
            test_y_score = model.predict_proba(test_x).tolist()
            test_ys_score += test_y_score
            # model_filename = os.path.join(exp_path, 'model', '%s_%s_fold_%d.pkl'
            #                               % (config['model'], marker, fold_i))
            # maybe_create_path(os.path.dirname(model_filename))
            # with open(model_filename, 'wb') as f:
            #     pickle.dump(model, f)

        train_metrics = eval_results(train_ys,
                                     train_ys_score,
                                     labels=dataset.classes,
                                     average='macro',
                                     threshold=threshold,
                                     num_fold=dataset.num_fold)
        test_metrics = eval_results(test_ys,
                                    test_ys_score,
                                    labels=dataset.classes,
                                    average='macro',
                                    threshold=train_metrics['used_threshold'],
                                    num_fold=dataset.num_fold)
        all_marker_train_metrics.append(train_metrics)
        all_marker_test_metrics.append(test_metrics)

        # print metrics to console and file
        double_print('marker: %s' % marker, metrics_file)
        double_print('metrics on training set:', metrics_file)
        for j, class_j in enumerate(dataset.classes):
            log_str = '[class: %s. threshold: %1.1f] ' % (
                class_j, 100 * train_metrics['used_threshold'][j])
            for metrics_name in metrics_names:
                log_str += '%s: %1.1f. ' % (metrics_name,
                                            train_metrics[metrics_name][j])
            double_print(log_str, metrics_file)
        for metrics_name in metrics_avg_names:
            double_print(
                '%s: %1.1f' % (metrics_name, train_metrics[metrics_name]),
                metrics_file)
        double_print('metrics on test set:', metrics_file)
        for j, class_j in enumerate(dataset.classes):
            log_str = '[class: %s. threshold: %1.1f] ' % (
                class_j, 100 * test_metrics['used_threshold'][j])
            for metrics_name in metrics_names:
                log_str += '%s: %1.1f. ' % (metrics_name,
                                            test_metrics[metrics_name][j])
            double_print(log_str, metrics_file)
        for metrics_name in metrics_avg_names:
            double_print(
                '%s: %1.1f' % (metrics_name, test_metrics[metrics_name]),
                metrics_file)

        # generate figure
        current_ax = ax[0, i]
        dataset.plot_data_clean_distribution(current_ax, marker)
        current_ax.set_title('data cleaning on marker %s' % marker)

        current_ax = ax[1, i]
        contour_flag = len(train_xs[0]) == 2
        # dup_reduced = list(tuple(tuple([train_xs[j] + [train_ys[j]] for j in range(len(train_xs))])))
        # dup_reduced_train_xs = [item[:-1] for item in dup_reduced]
        # dup_reduced_train_ys = [item[-1] for item in dup_reduced]
        # dup_reduced_train_ys_str = [str(item) for item in dup_reduced_train_ys]
        # train_x/test_x still hold the last fold's split, so together they
        # should cover each sample once (unlike train_xs/train_ys, which
        # accumulate duplicates across folds).
        dup_reduced_train_xs = train_x + test_x
        dup_reduced_train_ys = train_y + test_y
        dup_reduced_train_ys_str = [str(item) for item in dup_reduced_train_ys]
        classes_str = [str(item) for item in dataset.classes]
        plot_feature_distribution(
            dup_reduced_train_xs,
            ax=current_ax,
            t_sne=True,
            hue=dup_reduced_train_ys_str,
            hue_order=classes_str,
            style=dup_reduced_train_ys_str,
            style_order=classes_str,
            # x_lim='box', y_lim='box',
            x_lim='min_max_extend',
            y_lim='min_max_extend',
            contour=contour_flag,
            z_generator=best_model.predict)
        current_ax.set_title('%s trained on whole set' % marker)

        current_ax = ax[2, i]
        metrics.ConfusionMatrixDisplay(
            train_metrics['conf_mat'],
            display_labels=dataset.classes).plot(ax=current_ax)
        current_ax.set_title('%s on train set of all folds' % marker)

        current_ax = ax[3, i]
        for j in range(len(dataset.classes)):
            roc_curve = train_metrics['roc_curve'][j]
            roc_auc_score = train_metrics['roc_auc_score'][j]
            class_id = dataset.classes[j]
            sen = train_metrics['sensitivity'][j] / 100
            spe = train_metrics['specificity'][j] / 100
            metrics.RocCurveDisplay(fpr=roc_curve[0],
                                    tpr=roc_curve[1],
                                    roc_auc=roc_auc_score,
                                    estimator_name='class %s' %
                                    class_id).plot(ax=current_ax)
            current_ax.scatter(1 - spe, sen)

        current_ax = ax[4, i]
        table_val_list = [
            dataset.classes,
            [100 * item for item in train_metrics['used_threshold']]
        ]
        row_labels = ['cls', 'thr']
        for metrics_name in metrics_names:
            table_val_list.append(train_metrics[metrics_name])
            row_labels.append(metrics_name[:min(3, len(metrics_name))])
        additional_text = []
        for metrics_name in metrics_avg_names:
            additional_text.append('%s: %1.1f' %
                                   (metrics_name, train_metrics[metrics_name]))
        additional_text.append(best_params[marker])
        plot_table(table_val_list,
                   row_labels,
                   ax=current_ax,
                   additional_text=additional_text)

        current_ax = ax[5, i]
        contour_flag = len(train_xs[0]) == 2
        test_y_str = [str(item) for item in test_y]
        classes_str = [str(item) for item in dataset.classes]
        plot_feature_distribution(
            test_x,
            ax=current_ax,
            t_sne=True,
            hue=test_y_str,
            hue_order=classes_str,
            style=test_y_str,
            style_order=classes_str,
            # x_lim='box', y_lim='box',
            x_lim='min_max_extend',
            y_lim='min_max_extend',
            contour=contour_flag,
            z_generator=model.predict)
        current_ax.set_title('%s on test set of the last fold' % marker)

        current_ax = ax[6, i]
        metrics.ConfusionMatrixDisplay(
            test_metrics['conf_mat'],
            display_labels=dataset.classes).plot(ax=current_ax)
        current_ax.set_title('%s on test set of all folds' % marker)

        current_ax = ax[7, i]
        for j in range(len(dataset.classes)):
            roc_curve = test_metrics['roc_curve'][j]
            roc_auc_score = test_metrics['roc_auc_score'][j]
            class_id = dataset.classes[j]
            sen = test_metrics['sensitivity'][j] / 100
            spe = test_metrics['specificity'][j] / 100
            metrics.RocCurveDisplay(fpr=roc_curve[0],
                                    tpr=roc_curve[1],
                                    roc_auc=roc_auc_score,
                                    estimator_name='class %s' %
                                    class_id).plot(ax=current_ax)
            current_ax.scatter(1 - spe, sen)

        current_ax = ax[8, i]
        table_val_list = [
            dataset.classes,
            [100 * item for item in test_metrics['used_threshold']]
        ]
        row_labels = ['cls', 'thr']
        for metrics_name in metrics_names:
            table_val_list.append(test_metrics[metrics_name])
            row_labels.append(metrics_name[:min(3, len(metrics_name))])
        additional_text = []
        for metrics_name in metrics_avg_names:
            additional_text.append('%s: %1.1f' %
                                   (metrics_name, test_metrics[metrics_name]))
        plot_table(table_val_list,
                   row_labels,
                   ax=current_ax,
                   additional_text=additional_text)

    for metrics_name in metrics_avg_names:
        all_marker_values = [
            item[metrics_name] for item in all_marker_train_metrics
        ]
        double_print(
            'overall train %s: %1.1f' %
            (metrics_name, sum(all_marker_values) / len(all_marker_values)),
            metrics_file)
    for metrics_name in metrics_avg_names:
        all_marker_values = [
            item[metrics_name] for item in all_marker_test_metrics
        ]
        double_print(
            'overall test %s: %1.1f' %
            (metrics_name, sum(all_marker_values) / len(all_marker_values)),
            metrics_file)
    metrics_file.close()
    save_yaml(os.path.join(exp_path, 'best_params.yaml'), best_params)
    fig.savefig(metrics_fig_filename, bbox_inches='tight', pad_inches=1)
Example #8
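    # Snippet begins mid-script (an argparse parser constructed above is
    # assumed). It overlays each mask method's scores onto the original
    # "None" scores, re-ranks, and reports MRR@10 per mask method.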
    parser.add_argument(
        "--mask_methods",
        type=str,
        nargs="+",
        default=["commas", "token_mask", "attention_mask", "None"])
    parser.add_argument("--input_dir", type=str, default="./data/adversary")
    parser.add_argument("--output_dir", type=str, default="./data/adversary")
    args = parser.parse_args()

    origin_scores = read_scores(f"{args.input_dir}/None.score.tsv")
    for mask_method in args.mask_methods:
        new_scores = read_scores(f"{args.input_dir}/{mask_method}.score.tsv")
        for key, score in new_scores.items():
            if key in origin_scores:
                origin_scores[key] = score
        temp_score_path = f"{args.output_dir}/temp.{mask_method}.score.tsv"
        assert not os.path.exists(temp_score_path)
        with open(temp_score_path, "w") as outFile:
            for (qid, pid), score in origin_scores.items():
                outFile.write(f"{qid}\t{pid}\t{score}\n")
        output_rank_path = f"{args.output_dir}/{mask_method}.rank.tsv"
        generate_rank(temp_score_path, output_rank_path)
        subprocess.check_call(["rm", temp_score_path])
        mrr = eval_results(output_rank_path)
        abs_output_rank_path = os.path.abspath(output_rank_path)
        rank_with_mrr_path = f"{abs_output_rank_path}.{mrr:.3f}"
        if not os.path.exists(rank_with_mrr_path):
            subprocess.check_call(
                ["ln", "-s", abs_output_rank_path, rank_with_mrr_path])
        print(mask_method, "MRR@10:", mrr)
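The score-to-rank step used throughout these examples follows a simple pattern:
group the qid\tpid\tscore lines per query and sort by descending score. A
minimal sketch under that assumption (an illustration only, not the
repository's generate_rank):

from collections import defaultdict

def rank_from_scores(score_path, rank_path):
    # Read qid/pid/score lines, then write qid/pid/rank lines per query,
    # best-scoring passage first.
    per_query = defaultdict(list)
    with open(score_path) as f:
        for line in f:
            qid, pid, score = line.rstrip("\n").split("\t")
            per_query[qid].append((pid, float(score)))
    with open(rank_path, "w") as f:
        for qid, pairs in per_query.items():
            pairs.sort(key=lambda x: x[1], reverse=True)
            for rank, (pid, _) in enumerate(pairs, start=1):
                f.write(f"{qid}\t{pid}\t{rank}\n")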