import json

import torch
import tqdm
from allennlp.common import Params
from allennlp.common.util import import_module_and_submodules
from allennlp.data import Vocabulary
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.token_indexers import PretrainedTransformerIndexer
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.models import Model
from allennlp.nn import util as nn_util
from transformers import LukeForEntityClassification, LukeTokenizer

# Project-local imports; the exact module paths are assumptions based on the
# LUKE repository's ``examples_allennlp`` package, which also provides the
# ``Vocabulary.add_transformer_vocab`` extension used below.
from examples_allennlp.entity_typing.reader import EntityTypingReader
from examples_allennlp.utils.util import ENT


def evaluate_transformers_checkpoint(
    data_path: str,
    model_config_path: str,
    checkpoint_model_name: str,
    checkpoint_tokenizer_name: str,
    batch_size: int,
    cuda_device: int,
    result_save_path: str,
):
    """
    Expected results for ``test.json`` from the Open Entity dataset:
    {'micro_precision': 0.7997806072235107, 'micro_recall': 0.7657563090324402, 'micro_fscore': 0.7823987007141113}.

    Parameters
    ----------
    data_path : str
        Data path to the input file.
    model_config_path : str
        A config file that defines the model architecture to evaluate.
    checkpoint_model_name : str
        The name of the checkpoint in Hugging Face Model Hub.
    checkpoint_tokenizer_name : str
        This should be the name of the base pre-training model, because the
        tokenizer of the downstream task is sometimes not compatible with AllenNLP.
    batch_size : int
        Batch size used during evaluation.
    cuda_device : int
        CUDA device ID; pass a negative value to evaluate on the CPU.
    result_save_path : str
        If given, the evaluation metrics are written to this path as JSON.
    """
    import_module_and_submodules("examples_allennlp")

    tokenizer_kwargs = {"additional_special_tokens": [ENT]}
    reader = EntityTypingReader(
        tokenizer=PretrainedTransformerTokenizer(
            model_name=checkpoint_tokenizer_name,
            add_special_tokens=True,
            tokenizer_kwargs=tokenizer_kwargs),
        token_indexers={
            "tokens":
            PretrainedTransformerIndexer(model_name=checkpoint_tokenizer_name,
                                         tokenizer_kwargs=tokenizer_kwargs)
        },
        use_entity_feature=True,
    )

    transformers_tokenizer = LukeTokenizer.from_pretrained(
        checkpoint_model_name)
    transformers_model = LukeForEntityClassification.from_pretrained(
        checkpoint_model_name)

    # Mirror the transformers tokenizer vocab and label set in AllenNLP namespaces.
    vocab = Vocabulary()
    vocab.add_transformer_vocab(transformers_tokenizer, "tokens")
    num_labels = len(transformers_model.config.id2label)
    labels = [transformers_model.config.id2label[i] for i in range(num_labels)]
    vocab.add_tokens_to_namespace(labels, namespace="labels")

    # build the model from the config and reuse the checkpoint's classifier head
    params = Params.from_file(
        model_config_path,
        ext_vars={"TRANSFORMERS_MODEL_NAME": checkpoint_model_name})
    model = Model.from_params(params, vocab=vocab)
    model.classifier = transformers_model.classifier
    model.eval()

    # set the GPU device to use
    if cuda_device < 0:
        device = torch.device("cpu")
    else:
        device = torch.device(f"cuda:{cuda_device}")
    model = model.to(device)

    loader = MultiProcessDataLoader(reader,
                                    data_path,
                                    batch_size=batch_size,
                                    shuffle=False)
    loader.index_with(model.vocab)
    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            batch = nn_util.move_to_device(batch, device)
            model(**batch)  # the forward pass updates the model's metric accumulators

    metrics = model.get_metrics(reset=True)
    print(metrics)
    if result_save_path is not None:
        with open(result_save_path, "w") as f:
            json.dump(metrics, f)
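
A minimal invocation sketch for reference: the two checkpoint names are published studio-ousia models on the Hugging Face Hub, while the file paths are placeholders, not from the original project.

evaluate_transformers_checkpoint(
    data_path="data/entity_typing/test.json",           # placeholder path
    model_config_path="configs/entity_typing.jsonnet",  # placeholder path
    checkpoint_model_name="studio-ousia/luke-large-finetuned-open-entity",
    checkpoint_tokenizer_name="studio-ousia/luke-large",
    batch_size=32,
    cuda_device=0,
    result_save_path="entity_typing_metrics.json",
)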
Example #2
File: mlm.py  Project: lgessler/embur
from typing import Tuple

import torch
from allennlp.data import Vocabulary, Instance, Token
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer
from allennlp.nn.util import get_token_ids_from_text_field_tensors
from transformers import BertTokenizer, DataCollatorForWholeWordMask

tokenizer = BertTokenizer.from_pretrained('./bert_out')
vocab = Vocabulary(non_padded_namespaces=["tokens"])
vocab.add_transformer_vocab(tokenizer, "tokens")

# Sanity check: the transformer vocabulary should now resolve special tokens.
vocab.get_token_index("[PAD]", "tokens")


idx = PretrainedTransformerMismatchedIndexer("./bert_out", namespace="tokens")


def prepare_instance(s):
    # Whitespace-tokenize, index with the mismatched indexer, and print the
    # tokens each wordpiece ID maps back to.
    tokens = [Token(t) for t in s.split(" ")]
    indexed = idx.tokens_to_indices(tokens, vocab)
    print([vocab.get_token_from_index(i) for i in indexed['token_ids']])
    return Instance({"tokens": TextField(tokens, {"tokens": idx})})

instances = [prepare_instance("ϩⲙⲡⲣⲁⲛ ⲙⲡⲛⲟⲩⲧⲉ ⲛϣⲟⲣⲡ ⲁⲛⲟⲕ"), prepare_instance("ϩⲙⲡⲣⲁⲛ ⲙⲡⲛⲟⲩⲧⲉ ⲛϣⲟⲣⲡ ⲁⲛⲟⲕ")]
for i in instances:
    i["tokens"].index(vocab)

tensors = [i.as_tensor_dict() for i in instances]

collator = DataCollatorForWholeWordMask(tokenizer=tokenizer)
ids = torch.cat([tensors[0]['tokens']['tokens']['token_ids'].unsqueeze(0),
                 tensors[1]['tokens']['tokens']['token_ids'].unsqueeze(0)], dim=0)
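
# The original file is truncated here. As a sketch of the likely next step
# (an assumption, not from the source), the whole-word-mask collator can be
# applied to the batched IDs; it returns masked input_ids and MLM labels.
masked = collator([{"input_ids": row} for row in ids])
print(masked["input_ids"].shape, masked["labels"].shape)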
Example #3
# In addition to the imports from Example #1, this variant needs (module paths
# are assumptions based on the LUKE repository layout):
# from transformers import LukeForEntitySpanClassification
# from examples_allennlp.ner.reader import ConllSpanReader
def evaluate_transformers_checkpoint(
    data_path: str,
    model_config_path: str,
    checkpoint_model_name: str,
    checkpoint_tokenizer_name: str,
    batch_size: int,
    cuda_device: int,
    result_save_path: str,
    prediction_save_path: str,
):
    """
    Expected results for the CoNLL-2003 English NER test set:
    {'f1': 0.9461946902654867, 'precision': 0.945859872611465, 'recall': 0.9465297450424929}

    Parameters
    ----------
    data_path : str
        Data path to the input file.
    model_config_path : str
        A config file that defines the model architecture to evaluate.
    checkpoint_model_name : str
        The name of the checkpoint in Hugging Face Model Hub.
    checkpoint_tokenizer_name : str
        This should be the name of the base pre-training model, because the
        tokenizer of the downstream task is sometimes not compatible with AllenNLP.
    batch_size : int
        Batch size used during evaluation.
    cuda_device : int
        CUDA device ID; pass a negative value to evaluate on the CPU.
    result_save_path : str
        If given, the evaluation metrics are written to this path as JSON.
    prediction_save_path : str
        If given, the model's predictions are written to this path.
    """
    import_module_and_submodules("examples_allennlp")

    reader = ConllSpanReader(
        tokenizer=PretrainedTransformerTokenizer(
            model_name=checkpoint_tokenizer_name,
            add_special_tokens=False,
            tokenizer_kwargs={"add_prefix_space": True}),
        token_indexers={
            "tokens":
            PretrainedTransformerIndexer(model_name=checkpoint_tokenizer_name)
        },
        use_entity_feature=True,
    )

    transformers_tokenizer = LukeTokenizer.from_pretrained(
        checkpoint_model_name)
    transformers_model = LukeForEntitySpanClassification.from_pretrained(
        checkpoint_model_name)

    vocab = Vocabulary()
    vocab.add_transformer_vocab(transformers_tokenizer, "tokens")
    num_labels = len(transformers_model.config.id2label)
    labels = [transformers_model.config.id2label[i] for i in range(num_labels)]
    labels = ["O" if l == "NIL" else l for l in labels]
    vocab.add_tokens_to_namespace(labels, namespace="labels")

    # build the model from the config and reuse the checkpoint's classifier head
    params = Params.from_file(
        model_config_path,
        ext_vars={"TRANSFORMERS_MODEL_NAME": checkpoint_model_name})
    if prediction_save_path is not None:
        params["prediction_save_path"] = prediction_save_path
    model = Model.from_params(params, vocab=vocab)
    model.classifier = transformers_model.classifier
    model.eval()

    # set the GPU device to use
    if cuda_device < 0:
        device = torch.device("cpu")
    else:
        device = torch.device(f"cuda:{cuda_device}")
    model = model.to(device)

    loader = MultiProcessDataLoader(reader,
                                    data_path,
                                    batch_size=batch_size,
                                    shuffle=False)
    loader.index_with(model.vocab)
    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            batch = nn_util.move_to_device(batch, device)
            model(**batch)  # the forward pass updates the model's metric accumulators

    metrics = model.get_metrics(reset=True)
    print(metrics)
    if result_save_path is not None:
        with open(result_save_path, "w") as f:
            json.dump(metrics, f)
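
A minimal invocation sketch for reference: the two checkpoint names are published studio-ousia models on the Hugging Face Hub, while the file paths are placeholders, not from the original project.

evaluate_transformers_checkpoint(
    data_path="data/conll_2003/eng.testb",   # placeholder path
    model_config_path="configs/ner.jsonnet", # placeholder path
    checkpoint_model_name="studio-ousia/luke-large-finetuned-conll-2003",
    checkpoint_tokenizer_name="studio-ousia/luke-large",
    batch_size=32,
    cuda_device=0,
    result_save_path="ner_metrics.json",
    prediction_save_path="ner_predictions.jsonl",
)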