Example #1
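    # Note: SAMPLE_VOCAB, SAMPLE_MERGE_FILE and SAMPLE_ENTITY_VOCAB are fixture
    # file paths defined elsewhere in the tokenizer test module this helper
    # belongs to.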
    def get_tokenizer(self, task=None, **kwargs):
        kwargs.update(self.special_tokens_map)
        tokenizer = LukeTokenizer(
            vocab_file=SAMPLE_VOCAB,
            merges_file=SAMPLE_MERGE_FILE,
            entity_vocab_file=SAMPLE_ENTITY_VOCAB,
            task=task,
            **kwargs,
        )
        tokenizer.sanitize_special_tokens()
        return tokenizer
Example #2
    def test_entity_classification_padding_pytorch_tensors(self):
        tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base",
                                                  task="entity_classification",
                                                  return_token_type_ids=True)
        sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon ."
        # entity information
        span = (39, 42)

        encoding = tokenizer(sentence,
                             entity_spans=[span],
                             return_token_type_ids=True,
                             padding="max_length",
                             return_tensors="pt")

        # test words
        self.assertEqual(encoding["input_ids"].shape, (1, 512))
        self.assertEqual(encoding["attention_mask"].shape, (1, 512))
        self.assertEqual(encoding["token_type_ids"].shape, (1, 512))

        # test entities
        self.assertEqual(encoding["entity_ids"].shape, (1, 1))
        self.assertEqual(encoding["entity_attention_mask"].shape, (1, 1))
        self.assertEqual(encoding["entity_token_type_ids"].shape, (1, 1))
        self.assertEqual(
            encoding["entity_position_ids"].shape,
            (1, tokenizer.max_entity_length, tokenizer.max_mention_length))
Example #3
    def test_entity_classification_no_padding_or_truncation(self):
        tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base",
                                                  task="entity_classification")
        sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon ."
        span = (39, 42)

        encoding = tokenizer(sentence,
                             entity_spans=[span],
                             return_token_type_ids=True)

        # test words
        self.assertEqual(len(encoding["input_ids"]), 42)
        self.assertEqual(len(encoding["attention_mask"]), 42)
        self.assertEqual(len(encoding["token_type_ids"]), 42)
        self.assertEqual(
            tokenizer.decode(encoding["input_ids"],
                             spaces_between_special_tokens=False),
            "<s>Top seed Ana Ivanovic said on Thursday<ent> she<ent> could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon.</s>",
        )
        self.assertEqual(
            tokenizer.decode(encoding["input_ids"][9:12],
                             spaces_between_special_tokens=False),
            "<ent> she<ent>")

        # test entities
        self.assertEqual(encoding["entity_ids"], [2])
        self.assertEqual(encoding["entity_attention_mask"], [1])
        self.assertEqual(encoding["entity_token_type_ids"], [0])
        # fmt: off
        self.assertEqual(encoding["entity_position_ids"], [[
            9, 10, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
        ]])
        # fmt: on
Example #4
    def test_text_pair_padding_pytorch_tensors(self):
        tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base",
                                                  return_token_type_ids=True)
        sentence = "Top seed Ana Ivanovic said on Thursday"
        sentence_pair = "She could hardly believe her luck."
        entities = ["Ana Ivanovic", "Thursday"]
        entities_pair = ["Dummy Entity"]
        spans = [(9, 21), (30, 38)]
        spans_pair = [(0, 3)]

        encoding = tokenizer(
            sentence,
            sentence_pair,
            entities=entities,
            entities_pair=entities_pair,
            entity_spans=spans,
            entity_spans_pair=spans_pair,
            return_token_type_ids=True,
            padding="max_length",
            max_length=30,
            max_entity_length=16,
            return_tensors="pt",
        )

        # test words
        self.assertEqual(encoding["input_ids"].shape, (1, 30))
        self.assertEqual(encoding["attention_mask"].shape, (1, 30))
        self.assertEqual(encoding["token_type_ids"].shape, (1, 30))

        # test entities
        self.assertEqual(encoding["entity_ids"].shape, (1, 16))
        self.assertEqual(encoding["entity_attention_mask"].shape, (1, 16))
        self.assertEqual(encoding["entity_token_type_ids"].shape, (1, 16))
        self.assertEqual(encoding["entity_position_ids"].shape,
                         (1, 16, tokenizer.max_mention_length))
Example #5
    def test_inference_large_model(self):
        model = LukeModel.from_pretrained("studio-ousia/luke-large").eval()
        model.to(torch_device)

        tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large", task="entity_classification")
        text = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon ."
        span = (39, 42)
        encoding = tokenizer(text, entity_spans=[span], add_prefix_space=True, return_tensors="pt")

        # move all values to device
        for key, value in encoding.items():
            encoding[key] = encoding[key].to(torch_device)

        outputs = model(**encoding)

        # Verify word hidden states
        expected_shape = torch.Size((1, 42, 1024))
        self.assertEqual(outputs.last_hidden_state.shape, expected_shape)

        expected_slice = torch.tensor(
            [[0.0133, 0.0865, 0.0095], [0.3093, -0.2576, -0.7418], [-0.1720, -0.2117, -0.2869]]
        ).to(torch_device)
        self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4))

        # Verify entity hidden states
        expected_shape = torch.Size((1, 1, 1024))
        self.assertEqual(outputs.entity_last_hidden_state.shape, expected_shape)

        expected_slice = torch.tensor([[0.0466, -0.0106, -0.0179]]).to(torch_device)
        self.assertTrue(torch.allclose(outputs.entity_last_hidden_state[0, :3, :3], expected_slice, atol=1e-4))
Example #6
    def test_entity_span_classification_padding_pytorch_tensors(self):
        tokenizer = LukeTokenizer.from_pretrained(
            "studio-ousia/luke-base",
            task="entity_span_classification",
            return_token_type_ids=True)
        sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck."
        spans = [(0, 8), (9, 21), (39, 42)]

        encoding = tokenizer(
            sentence,
            entity_spans=spans,
            return_token_type_ids=True,
            padding="max_length",
            max_length=30,
            max_entity_length=16,
            return_tensors="pt",
        )

        # test words
        self.assertEqual(encoding["input_ids"].shape, (1, 30))
        self.assertEqual(encoding["attention_mask"].shape, (1, 30))
        self.assertEqual(encoding["token_type_ids"].shape, (1, 30))

        # test entities
        self.assertEqual(encoding["entity_ids"].shape, (1, 16))
        self.assertEqual(encoding["entity_attention_mask"].shape, (1, 16))
        self.assertEqual(encoding["entity_token_type_ids"].shape, (1, 16))
        self.assertEqual(encoding["entity_position_ids"].shape,
                         (1, 16, tokenizer.max_mention_length))
        self.assertEqual(encoding["entity_start_positions"].shape, (1, 16))
        self.assertEqual(encoding["entity_end_positions"].shape, (1, 16))
Example #7
    def test_single_text_no_padding_or_truncation(self):
        tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base",
                                                  return_token_type_ids=True)
        sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck."
        entities = ["Ana Ivanovic", "Thursday", "Dummy Entity"]
        spans = [(9, 21), (30, 38), (39, 42)]

        encoding = tokenizer(sentence,
                             entities=entities,
                             entity_spans=spans,
                             return_token_type_ids=True)

        self.assertEqual(
            tokenizer.decode(encoding["input_ids"],
                             spaces_between_special_tokens=False),
            "<s>Top seed Ana Ivanovic said on Thursday she could hardly believe her luck.</s>",
        )
        self.assertEqual(
            tokenizer.decode(encoding["input_ids"][3:6],
                             spaces_between_special_tokens=False),
            " Ana Ivanovic")
        self.assertEqual(
            tokenizer.decode(encoding["input_ids"][8:9],
                             spaces_between_special_tokens=False), " Thursday")
        self.assertEqual(
            tokenizer.decode(encoding["input_ids"][9:10],
                             spaces_between_special_tokens=False), " she")

        self.assertEqual(
            encoding["entity_ids"],
            [
                tokenizer.entity_vocab["Ana Ivanovic"],
                tokenizer.entity_vocab["Thursday"],
                tokenizer.entity_vocab["[UNK]"],
            ],
        )
        self.assertEqual(encoding["entity_attention_mask"], [1, 1, 1])
        self.assertEqual(encoding["entity_token_type_ids"], [0, 0, 0])
        # fmt: off
        self.assertEqual(encoding["entity_position_ids"], [
            [
                3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
            ],
            [
                8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
            ],
            [
                9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
            ],
        ])
        # fmt: on
Example #8
    def test_text_pair_only_entity_spans_no_padding_or_truncation(self):
        tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base",
                                                  return_token_type_ids=True)
        sentence = "Top seed Ana Ivanovic said on Thursday"
        sentence_pair = "She could hardly believe her luck."
        spans = [(9, 21), (30, 38)]
        spans_pair = [(0, 3)]

        encoding = tokenizer(
            sentence,
            sentence_pair,
            entity_spans=spans,
            entity_spans_pair=spans_pair,
            return_token_type_ids=True,
        )

        self.assertEqual(
            tokenizer.decode(encoding["input_ids"],
                             spaces_between_special_tokens=False),
            "<s>Top seed Ana Ivanovic said on Thursday</s></s>She could hardly believe her luck.</s>",
        )
        self.assertEqual(
            tokenizer.decode(encoding["input_ids"][3:6],
                             spaces_between_special_tokens=False),
            " Ana Ivanovic")
        self.assertEqual(
            tokenizer.decode(encoding["input_ids"][8:9],
                             spaces_between_special_tokens=False), " Thursday")
        self.assertEqual(
            tokenizer.decode(encoding["input_ids"][11:12],
                             spaces_between_special_tokens=False), "She")

        mask_id = tokenizer.entity_vocab["[MASK]"]
        self.assertEqual(encoding["entity_ids"], [mask_id, mask_id, mask_id])
        self.assertEqual(encoding["entity_attention_mask"], [1, 1, 1])
        self.assertEqual(encoding["entity_token_type_ids"], [0, 0, 0])
        # fmt: off
        self.assertEqual(encoding["entity_position_ids"], [
            [
                3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
            ],
            [
                8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
            ],
            [
                11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
            ],
        ])
        # fmt: on
Example #9
    def test_entity_pair_classification_no_padding_or_truncation(self):
        tokenizer = LukeTokenizer.from_pretrained(
            "studio-ousia/luke-base",
            task="entity_pair_classification",
            return_token_type_ids=True)
        sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck."
        # head and tail information
        spans = [(9, 21), (39, 42)]

        encoding = tokenizer(sentence,
                             entity_spans=spans,
                             return_token_type_ids=True)

        self.assertEqual(
            tokenizer.decode(encoding["input_ids"],
                             spaces_between_special_tokens=False),
            "<s>Top seed<ent> Ana Ivanovic<ent> said on Thursday<ent2> she<ent2> could hardly believe her luck.</s>",
        )
        self.assertEqual(
            tokenizer.decode(encoding["input_ids"][3:8],
                             spaces_between_special_tokens=False),
            "<ent> Ana Ivanovic<ent>",
        )
        self.assertEqual(
            tokenizer.decode(encoding["input_ids"][11:14],
                             spaces_between_special_tokens=False),
            "<ent2> she<ent2>")

        self.assertEqual(encoding["entity_ids"], [2, 3])
        self.assertEqual(encoding["entity_attention_mask"], [1, 1])
        self.assertEqual(encoding["entity_token_type_ids"], [0, 0])
        # fmt: off
        self.assertEqual(encoding["entity_position_ids"], [
            [
                3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
            ],
            [
                11, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
            ],
        ])
        # fmt: on
Example #10
    def test_entity_span_classification_no_padding_or_truncation(self):
        tokenizer = LukeTokenizer.from_pretrained(
            "studio-ousia/luke-base",
            task="entity_span_classification",
            return_token_type_ids=True)
        sentence = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck."
        spans = [(0, 8), (9, 21), (39, 42)]

        encoding = tokenizer(sentence,
                             entity_spans=spans,
                             return_token_type_ids=True)

        self.assertEqual(
            tokenizer.decode(encoding["input_ids"],
                             spaces_between_special_tokens=False),
            "<s>Top seed Ana Ivanovic said on Thursday she could hardly believe her luck.</s>",
        )

        self.assertEqual(encoding["entity_ids"], [2, 2, 2])
        self.assertEqual(encoding["entity_attention_mask"], [1, 1, 1])
        self.assertEqual(encoding["entity_token_type_ids"], [0, 0, 0])
        # fmt: off
        self.assertEqual(encoding["entity_position_ids"], [
            [
                1, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
            ],
            [
                3, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
            ],
            [
                9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
            ],
        ])
        # fmt: on
        self.assertEqual(encoding["entity_start_positions"], [1, 3, 9])
        self.assertEqual(encoding["entity_end_positions"], [2, 5, 9])
Example #11
def evaluate_transformers_checkpoint(
    data_path: str,
    model_config_path: str,
    checkpoint_model_name: str,
    checkpoint_tokenizer_name: str,
    batch_size: int,
    cuda_device: int,
    result_save_path: str,
):
    """
    Expected results for ``test.json`` from the Open Entity dataset:
    {'micro_precision': 0.7997806072235107, 'micro_recall': 0.7657563090324402, 'micro_fscore': 0.7823987007141113}.

    Parameters
    ----------
    data_path : str
        Data path to the input file.
    model_config_path : str
        A config file that defines the model architecture to evaluate.
    checkpoint_model_name : str
        The name of the checkpoint in the Hugging Face Model Hub.
    checkpoint_tokenizer_name : str
        This should be the name of the base pre-training model, because the
        tokenizer of the downstream task is sometimes not compatible with AllenNLP.
    batch_size : int
    cuda_device : int
    result_save_path : str
    """
    import_module_and_submodules("examples_allennlp")

    tokenizer_kwargs = {"additional_special_tokens": [ENT]}
    reader = EntityTypingReader(
        tokenizer=PretrainedTransformerTokenizer(
            model_name=checkpoint_tokenizer_name,
            add_special_tokens=True,
            tokenizer_kwargs=tokenizer_kwargs),
        token_indexers={
            "tokens":
            PretrainedTransformerIndexer(model_name=checkpoint_tokenizer_name,
                                         tokenizer_kwargs=tokenizer_kwargs)
        },
        use_entity_feature=True,
    )

    transformers_tokenizer = LukeTokenizer.from_pretrained(
        checkpoint_model_name)
    transformers_model = LukeForEntityClassification.from_pretrained(
        checkpoint_model_name)

    vocab = Vocabulary()
    vocab.add_transformer_vocab(transformers_tokenizer, "tokens")
    num_labels = len(transformers_model.config.id2label)
    labels = [transformers_model.config.id2label[i] for i in range(num_labels)]
    vocab.add_tokens_to_namespace(labels, namespace="labels")

    # read model
    params = Params.from_file(
        model_config_path,
        ext_vars={"TRANSFORMERS_MODEL_NAME": checkpoint_model_name})
    model = Model.from_params(params, vocab=vocab)
    model.classifier = transformers_model.classifier
    model.eval()

    # set the GPU device to use
    if cuda_device < 0:
        device = torch.device("cpu")
    else:
        device = torch.device(f"cuda:{cuda_device}")
    model = model.to(device)

    loader = MultiProcessDataLoader(reader,
                                    data_path,
                                    batch_size=batch_size,
                                    shuffle=False)
    loader.index_with(model.vocab)
    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            batch = nn_util.move_to_device(batch, device)
            output_dict = model(**batch)

    metrics = model.get_metrics(reset=True)
    print(metrics)
    if result_save_path is not None:
        with open(result_save_path, "w") as f:
            json.dump(metrics, f)
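A hypothetical invocation of the function above; every path and the fine-tuned
checkpoint name below are illustrative placeholders, not values from the source:

if __name__ == "__main__":
    evaluate_transformers_checkpoint(
        data_path="data/open_entity/test.json",             # placeholder path
        model_config_path="configs/entity_typing.jsonnet",  # placeholder path
        checkpoint_model_name="studio-ousia/luke-large-finetuned-open-entity",
        checkpoint_tokenizer_name="studio-ousia/luke-large",
        batch_size=32,
        cuda_device=0,
        result_save_path="metrics.json",
    )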
Example #12
def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    handler = DistributedDataParallelKwargs(find_unused_parameters=True)
    accelerator = Accelerator(kwargs_handlers=[handler])
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
            if args.hub_model_id is None:
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
            repo = Repository(args.output_dir, clone_from=repo_name)
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called
    # 'tokens' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # In debug mode, trim each split to a small number of examples
    if args.debug:
        for split in raw_datasets.keys():
            raw_datasets[split] = raw_datasets[split].select(range(100))
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
        features = raw_datasets["train"].features
    else:
        column_names = raw_datasets["validation"].column_names
        features = raw_datasets["validation"].features

    if args.text_column_name is not None:
        text_column_name = args.text_column_name
    elif "tokens" in column_names:
        text_column_name = "tokens"
    else:
        text_column_name = column_names[0]

    if args.label_column_name is not None:
        label_column_name = args.label_column_name
    elif f"{args.task_name}_tags" in column_names:
        label_column_name = f"{args.task_name}_tags"
    else:
        label_column_name = column_names[1]

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
    else:
        label_list = get_label_list(raw_datasets["train"][label_column_name])
    num_labels = len(label_list)

    # Map that sends B-Xxx label to its I-Xxx counterpart
    b_to_i_label = []

    for idx, label in enumerate(label_list):
        if label.startswith("B-") and label.replace("B-", "I-") in label_list:
            b_to_i_label.append(label_list.index(label.replace("B-", "I-")))
        else:
            b_to_i_label.append(idx)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = LukeConfig.from_pretrained(args.config_name, num_labels=num_labels)
    elif args.model_name_or_path:
        config = LukeConfig.from_pretrained(args.model_name_or_path, num_labels=num_labels)
    else:
        logger.warning("You are instantiating a new config instance from scratch.")

    tokenizer_name_or_path = args.tokenizer_name if args.tokenizer_name else args.model_name_or_path
    if not tokenizer_name_or_path:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    tokenizer = LukeTokenizer.from_pretrained(
        tokenizer_name_or_path,
        use_fast=False,
        task="entity_span_classification",
        max_entity_length=args.max_entity_length,
        max_mention_length=args.max_mention_length,
    )

    if args.model_name_or_path:
        model = LukeForEntitySpanClassification.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = LukeForEntitySpanClassification(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    padding = "max_length" if args.pad_to_max_length else False

    def compute_sentence_boundaries_for_luke(examples):
        sentence_boundaries = []

        for tokens in examples[text_column_name]:
            sentence_boundaries.append([0, len(tokens)])

        examples["sentence_boundaries"] = sentence_boundaries

        return examples

    def compute_entity_spans_for_luke(examples):
        all_entity_spans = []
        texts = []
        all_labels_entity_spans = []
        all_original_entity_spans = []

        for labels, tokens, sentence_boundaries in zip(
            examples[label_column_name], examples[text_column_name], examples["sentence_boundaries"]
        ):
            subword_lengths = [len(tokenizer.tokenize(token)) for token in tokens]
            total_subword_length = sum(subword_lengths)
            _, context_end = sentence_boundaries

            if total_subword_length > args.max_length - 2:
                cur_length = sum(subword_lengths[:context_end])
                idx = context_end - 1

                while cur_length > args.max_length - 2:
                    cur_length -= subword_lengths[idx]
                    context_end -= 1
                    idx -= 1

            text = ""
            sentence_words = tokens[:context_end]
            sentence_subword_lengths = subword_lengths[:context_end]
            word_start_char_positions = []
            word_end_char_positions = []
            labels_positions = {}

            for word, label in zip(sentence_words, labels):
                if word[0] == "'" or (len(word) == 1 and is_punctuation(word)):
                    text = text.rstrip()

                word_start_char_positions.append(len(text))
                text += word
                word_end_char_positions.append(len(text))
                text += " "
                labels_positions[(word_start_char_positions[-1], word_end_char_positions[-1])] = label

            text = text.rstrip()
            texts.append(text)
            entity_spans = []
            labels_entity_spans = []
            original_entity_spans = []

            for word_start in range(len(sentence_words)):
                for word_end in range(word_start, len(sentence_words)):
                    if (
                        sum(sentence_subword_lengths[word_start : word_end + 1]) <= tokenizer.max_mention_length  # word_end is inclusive
                        and len(entity_spans) < tokenizer.max_entity_length
                    ):
                        entity_spans.append((word_start_char_positions[word_start], word_end_char_positions[word_end]))
                        original_entity_spans.append((word_start, word_end + 1))
                        if (
                            word_start_char_positions[word_start],
                            word_end_char_positions[word_end],
                        ) in labels_positions:
                            labels_entity_spans.append(
                                labels_positions[
                                    (word_start_char_positions[word_start], word_end_char_positions[word_end])
                                ]
                            )
                        else:
                            labels_entity_spans.append(0)

            all_entity_spans.append(entity_spans)
            all_labels_entity_spans.append(labels_entity_spans)
            all_original_entity_spans.append(original_entity_spans)

        examples["entity_spans"] = all_entity_spans
        examples["text"] = texts
        examples["labels_entity_spans"] = all_labels_entity_spans
        examples["original_entity_spans"] = all_original_entity_spans

        return examples

    def tokenize_and_align_labels(examples):
        entity_spans = []

        for v in examples["entity_spans"]:
            entity_spans.append(list(map(tuple, v)))

        tokenized_inputs = tokenizer(
            examples["text"],
            entity_spans=entity_spans,
            max_length=args.max_length,
            padding=padding,
            truncation=True,
        )

        if padding == "max_length":
            tokenized_inputs["labels"] = padding_tensor(
                examples["labels_entity_spans"], -100, tokenizer.padding_side, tokenizer.max_entity_length
            )
            tokenized_inputs["original_entity_spans"] = padding_tensor(
                examples["original_entity_spans"], (-1, -1), tokenizer.padding_side, tokenizer.max_entity_length
            )
            tokenized_inputs[label_column_name] = padding_tensor(
                examples[label_column_name], -1, tokenizer.padding_side, tokenizer.max_entity_length
            )
        else:
            tokenized_inputs["labels"] = [ex[: tokenizer.max_entity_length] for ex in examples["labels_entity_spans"]]
            tokenized_inputs["original_entity_spans"] = [
                ex[: tokenizer.max_entity_length] for ex in examples["original_entity_spans"]
            ]
            tokenized_inputs[label_column_name] = [
                ex[: tokenizer.max_entity_length] for ex in examples[label_column_name]
            ]

        return tokenized_inputs

    with accelerator.main_process_first():
        raw_datasets = raw_datasets.map(
            compute_sentence_boundaries_for_luke,
            batched=True,
            desc="Adding sentence boundaries",
        )
        raw_datasets = raw_datasets.map(
            compute_entity_spans_for_luke,
            batched=True,
            desc="Adding entity spans",
        )

        processed_raw_datasets = raw_datasets.map(
            tokenize_and_align_labels,
            batched=True,
            remove_columns=raw_datasets["train"].column_names,
            desc="Running tokenizer on dataset",
        )

    train_dataset = processed_raw_datasets["train"]
    eval_dataset = processed_raw_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert everything
        # to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the maximum length of
        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
        # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorForLukeTokenClassification(
            tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
        )

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
    )
    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Use the device given by the `accelerator` object.
    device = accelerator.device
    model.to(device)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Note: the training dataloader needs to be prepared before we grab its length below (because its length will be
    # shorter in a multiprocess setup).

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Metrics
    metric = load_metric("seqeval")

    def get_luke_labels(outputs, ner_tags, original_entity_spans):
        true_predictions = []
        true_labels = []

        for output, original_spans, tags in zip(outputs.logits, original_entity_spans, ner_tags):
            true_tags = [val for val in tags if val != -1]
            true_original_spans = [val for val in original_spans if val != (-1, -1)]
            max_indices = torch.argmax(output, axis=1)
            max_logits = torch.max(output, axis=1).values
            predictions = []

            for logit, index, span in zip(max_logits, max_indices, true_original_spans):
                if index != 0:
                    predictions.append((logit, span, label_list[index]))

            predicted_sequence = [label_list[0]] * len(true_tags)

            for _, span, label in sorted(predictions, key=lambda o: o[0], reverse=True):
                if all([o == label_list[0] for o in predicted_sequence[span[0] : span[1]]]):
                    predicted_sequence[span[0]] = label
                    if span[1] - span[0] > 1:
                        predicted_sequence[span[0] + 1 : span[1]] = [label] * (span[1] - span[0] - 1)

            true_predictions.append(predicted_sequence)
            true_labels.append([label_list[tag_id] for tag_id in true_tags])

        return true_predictions, true_labels

    def compute_metrics():
        results = metric.compute()
        if args.return_entity_level_metrics:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            _ = batch.pop("original_entity_spans")
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            original_entity_spans = batch.pop("original_entity_spans")
            with torch.no_grad():
                outputs = model(**batch)

            preds, refs = get_luke_labels(outputs, batch[label_column_name], original_entity_spans)

            metric.add_batch(
                predictions=preds,
                references=refs,
            )  # predictions and references are expected to be nested lists of labels, not label ids

        eval_metric = compute_metrics()
        accelerator.print(f"epoch {epoch}:", eval_metric)

        if args.push_to_hub and epoch < args.num_train_epochs - 1:
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(args.output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
                )

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            if args.push_to_hub:
                repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
Example #13
    # The snippet begins mid-statement: the Formatter's message format string
    # is elided in the source, so only the surviving date format is kept here.
    fmt = logging.Formatter(fmt=None, datefmt='%m/%d/%Y %I:%M:%S %p')
    console = logging.StreamHandler()
    console.setFormatter(fmt)
    logger.addHandler(console)
    logfile = logging.FileHandler(args.log_file, 'a')
    logfile.setFormatter(fmt)
    logger.addHandler(logfile)


    if args.test:
        test_graph = load_data(args.test_file)
        model = torch.load(args.load_model)
        model.device = device 
        #tokenizer = BertTokenizer.from_pretrained(args.bert_model)
        tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
        dev_dataset = QBLINKDataset(test_graph, args, tokenizer, False)
        dev_loader = DataLoader(dataset=dev_dataset,
                                batch_size=1,
                                collate_fn=batcher(device),
                                shuffle=False,
                                num_workers=0)
        model.to(device)
        score, total_list = evaluate(dev_loader, model)
        exit()

    train_graph = load_data(args.train_file)
    dev_graph = load_data(args.dev_file)

    model = Model(args)
    model.device = device
Example #14
def load_rel(model: str, device: str):
    """
    Load a LUKE Relation Extraction model and return a function that applies it

    Args:
        model (str): model name to be loaded
        device (str): device info

    Returns:
        function: LUKE-based Relation Extraction function

    """
    print("Loading Relation Extraction Pipeline...")

    try:
        # yapf:disable
        tokenizer = LukeTokenizer.from_pretrained(model)
        model = LukeForEntityPairClassification.from_pretrained(model).to(device)
        # yapf:enable
    except (HTTPError, OSError):
        print("Input model is not supported by HuggingFace Hub")
        raise

    def extract_relation(sentences: List) -> List[Tuple]:
        """
        Extraction Relation based on Entity Information

        Args:
            sentence (str): original sentence containing context
            head_entity (Dict): head entity containing position information
            tail_entity (Dict): tail entity containing position information

        Returns:
            List[Tuple]: list of (head_entity, relation, tail_entity) formatted triples

        """
        triples = list()

        # TODO: batchify
        for sentence in sentences:
            tokens = tokenizer(
                sentence["text"],
                entity_spans=[
                    (sentence["spans"][0][0], sentence["spans"][0][-1]),
                    (sentence["spans"][-1][0], sentence["spans"][-1][-1]),
                ],
                return_tensors="pt",
            ).to(device)
            outputs = model(**tokens)
            predicted_id = int(outputs.logits[0].argmax())
            relation = model.config.id2label[predicted_id]

            if relation != "no_relation":
                triples.append((
                    sentence["text"]
                    [sentence["spans"][0][0]:sentence["spans"][0][-1]],
                    relation,
                    sentence["text"]
                    [sentence["spans"][-1][0]:sentence["spans"][-1][-1]],
                ))

        return triples

    return extract_relation
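A hypothetical usage sketch of the closure returned above; the input dict
layout follows the code (a "text" string plus head/tail "spans"), while the
model name and the printed triple are illustrative:

extract = load_rel("studio-ousia/luke-large-finetuned-tacred", "cpu")
sentences = [{
    "text": "Beyoncé lives in Los Angeles.",
    "spans": [(0, 7), (17, 28)],  # head and tail mention character offsets
}]
print(extract(sentences))
# e.g. [('Beyoncé', 'per:cities_of_residence', 'Los Angeles')]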
Example #15
import os
from transformers import LukeTokenizer, LukeModel, LukeForEntityPairClassification, LukeForEntitySpanClassification

###
# LUKE models in the following examples will be saved to cache_dir=../../models
#
# studio-ousia/luke-base
# studio-ousia/luke-large-finetuned-tacred
# studio-ousia/luke-large-finetuned-conll-2003
###

cache_dir = os.getenv("cache_dir", "../../models")
model = LukeModel.from_pretrained("studio-ousia/luke-base", cache_dir=cache_dir)
tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base", cache_dir=cache_dir)

# Example 1: Computing the contextualized entity representation corresponding to the entity mention "Beyoncé"
text = "Beyoncé lives in Los Angeles."
entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"
inputs = tokenizer(text,
                   entity_spans=entity_spans,
                   add_prefix_space=True,
                   return_tensors="pt")
outputs = model(**inputs)
word_last_hidden_state = outputs.last_hidden_state
entity_last_hidden_state = outputs.entity_last_hidden_state

# Example 2: Inputting Wikipedia entities to obtain enriched contextualized representations
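# (The source snippet ends here; the lines below are a sketch of Example 2
# based on the LukeModel documentation, reusing the text from Example 1.)
entities = ["Beyoncé", "Los Angeles"]  # Wikipedia entity titles for the two mentions
entity_spans = [(0, 7), (17, 28)]  # character-based entity spans
inputs = tokenizer(text,
                   entities=entities,
                   entity_spans=entity_spans,
                   add_prefix_space=True,
                   return_tensors="pt")
outputs = model(**inputs)
word_last_hidden_state = outputs.last_hidden_state
entity_last_hidden_state = outputs.entity_last_hidden_state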
Example #16
def convert_luke_checkpoint(checkpoint_path, metadata_path, entity_vocab_path,
                            pytorch_dump_folder_path, model_size):
    # Load configuration defined in the metadata file
    with open(metadata_path) as metadata_file:
        metadata = json.load(metadata_file)
    config = LukeConfig(use_entity_aware_attention=True,
                        **metadata["model_config"])

    # Load in the weights from the checkpoint_path
    state_dict = torch.load(checkpoint_path, map_location="cpu")

    # Load the entity vocab file
    entity_vocab = load_entity_vocab(entity_vocab_path)

    tokenizer = RobertaTokenizer.from_pretrained(
        metadata["model_config"]["bert_model_name"])

    # Add special tokens to the token vocabulary for downstream tasks
    entity_token_1 = AddedToken("<ent>", lstrip=False, rstrip=False)
    entity_token_2 = AddedToken("<ent2>", lstrip=False, rstrip=False)
    tokenizer.add_special_tokens(
        dict(additional_special_tokens=[entity_token_1, entity_token_2]))
    config.vocab_size += 2

    print(f"Saving tokenizer to {pytorch_dump_folder_path}")
    tokenizer.save_pretrained(pytorch_dump_folder_path)
    with open(
            os.path.join(pytorch_dump_folder_path,
                         LukeTokenizer.vocab_files_names["entity_vocab_file"]),
            "w") as f:
        json.dump(entity_vocab, f)

    tokenizer = LukeTokenizer.from_pretrained(pytorch_dump_folder_path)

    # Initialize the embeddings of the special tokens
    word_emb = state_dict["embeddings.word_embeddings.weight"]
    ent_emb = word_emb[tokenizer.convert_tokens_to_ids(["@"])[0]].unsqueeze(0)
    ent2_emb = word_emb[tokenizer.convert_tokens_to_ids(["#"])[0]].unsqueeze(0)
    state_dict["embeddings.word_embeddings.weight"] = torch.cat(
        [word_emb, ent_emb, ent2_emb])

    # Initialize the query layers of the entity-aware self-attention mechanism
    for layer_index in range(config.num_hidden_layers):
        for matrix_name in ["query.weight", "query.bias"]:
            prefix = f"encoder.layer.{layer_index}.attention.self."
            state_dict[prefix + "w2e_" + matrix_name] = state_dict[prefix + matrix_name]
            state_dict[prefix + "e2w_" + matrix_name] = state_dict[prefix + matrix_name]
            state_dict[prefix + "e2e_" + matrix_name] = state_dict[prefix + matrix_name]

    # Initialize the embedding of the [MASK2] entity using that of the [MASK] entity for downstream tasks
    entity_emb = state_dict["entity_embeddings.entity_embeddings.weight"]
    entity_emb[entity_vocab["[MASK2]"]] = entity_emb[entity_vocab["[MASK]"]]

    model = LukeModel(config=config).eval()

    missing_keys, unexpected_keys = model.load_state_dict(state_dict,
                                                          strict=False)
    if not (len(missing_keys) == 1
            and missing_keys[0] == "embeddings.position_ids"):
        raise ValueError(
            f"Missing keys {', '.join(missing_keys)}. Expected only missing embeddings.position_ids"
        )
    if not (all(
            key.startswith("entity_predictions") or key.startswith("lm_head")
            for key in unexpected_keys)):
        raise ValueError(
            f"Unexpected keys {', '.join([key for key in unexpected_keys if not (key.startswith('entity_predictions') or key.startswith('lm_head'))])}"
        )

    # Check outputs
    tokenizer = LukeTokenizer.from_pretrained(pytorch_dump_folder_path,
                                              task="entity_classification")

    text = "Top seed Ana Ivanovic said on Thursday she could hardly believe her luck as a fortuitous netcord helped the new world number one avoid a humiliating second- round exit at Wimbledon ."
    span = (39, 42)
    encoding = tokenizer(text,
                         entity_spans=[span],
                         add_prefix_space=True,
                         return_tensors="pt")

    outputs = model(**encoding)

    # Verify word hidden states
    if model_size == "large":
        expected_shape = torch.Size((1, 42, 1024))
        expected_slice = torch.tensor([[0.0133, 0.0865, 0.0095],
                                       [0.3093, -0.2576, -0.7418],
                                       [-0.1720, -0.2117, -0.2869]])
    else:  # base
        expected_shape = torch.Size((1, 42, 768))
        expected_slice = torch.tensor([[0.0037, 0.1368, -0.0091],
                                       [0.1099, 0.3329, -0.1095],
                                       [0.0765, 0.5335, 0.1179]])

    if outputs.last_hidden_state.shape != expected_shape:
        raise ValueError(
            f"Outputs.last_hidden_state.shape is {outputs.last_hidden_state.shape}, Expected shape is {expected_shape}"
        )
    if not torch.allclose(
            outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4):
        raise ValueError

    # Verify entity hidden states
    if model_size == "large":
        expected_shape = torch.Size((1, 1, 1024))
        expected_slice = torch.tensor([[0.0466, -0.0106, -0.0179]])
    else:  # base
        expected_shape = torch.Size((1, 1, 768))
        expected_slice = torch.tensor([[0.1457, 0.1044, 0.0174]])

    if outputs.entity_last_hidden_state.shape != expected_shape:
        raise ValueError(
            f"Outputs.entity_last_hidden_state.shape is {outputs.entity_last_hidden_state.shape}, Expected shape is {expected_shape}"
        )
    if not torch.allclose(outputs.entity_last_hidden_state[0, :3, :3],
                          expected_slice,
                          atol=1e-4):
        raise ValueError

    # Finally, save our PyTorch model and tokenizer
    print("Saving PyTorch model to {}".format(pytorch_dump_folder_path))
    model.save_pretrained(pytorch_dump_folder_path)
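A hypothetical direct call to the converter above; all paths are placeholders
for illustration, and model_size selects the expected-output branch in the code:

convert_luke_checkpoint(
    checkpoint_path="luke_large_500k.tar.gz",    # placeholder
    metadata_path="metadata.json",               # placeholder
    entity_vocab_path="entity_vocab.tsv",        # placeholder
    pytorch_dump_folder_path="converted_luke",   # placeholder
    model_size="large",                          # "large" or "base"
)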
Example #17
def evaluate_transformers_checkpoint(
    data_path: str,
    model_config_path: str,
    checkpoint_model_name: str,
    checkpoint_tokenizer_name: str,
    batch_size: int,
    cuda_device: int,
    result_save_path: str,
    prediction_save_path: str,
):
    """
    Expected results for the CoNLL-2003 English NER test set:
    {'f1': 0.9461946902654867, 'precision': 0.945859872611465, 'recall': 0.9465297450424929}

    Parameters
    ----------
    data_path : str
        Data path to the input file.
    model_config_path : str
        A config file that defines the model architecture to evaluate.
    checkpoint_model_name : str
        The name of the checkpoint in the Hugging Face Model Hub.
    checkpoint_tokenizer_name : str
        This should be the name of the base pre-training model, because the
        tokenizer of the downstream task is sometimes not compatible with AllenNLP.
    batch_size : int
    cuda_device : int
    result_save_path : str
    prediction_save_path : str
    """
    import_module_and_submodules("examples_allennlp")

    reader = ConllSpanReader(
        tokenizer=PretrainedTransformerTokenizer(
            model_name=checkpoint_tokenizer_name,
            add_special_tokens=False,
            tokenizer_kwargs={"add_prefix_space": True}),
        token_indexers={
            "tokens":
            PretrainedTransformerIndexer(model_name=checkpoint_tokenizer_name)
        },
        use_entity_feature=True,
    )

    transformers_tokenizer = LukeTokenizer.from_pretrained(
        checkpoint_model_name)
    transformers_model = LukeForEntitySpanClassification.from_pretrained(
        checkpoint_model_name)

    vocab = Vocabulary()
    vocab.add_transformer_vocab(transformers_tokenizer, "tokens")
    num_labels = len(transformers_model.config.id2label)
    labels = [transformers_model.config.id2label[i] for i in range(num_labels)]
    labels = ["O" if l == "NIL" else l for l in labels]
    vocab.add_tokens_to_namespace(labels, namespace="labels")

    # read model
    params = Params.from_file(
        model_config_path,
        ext_vars={"TRANSFORMERS_MODEL_NAME": checkpoint_model_name})
    if prediction_save_path is not None:
        params["prediction_save_path"] = prediction_save_path
    model = Model.from_params(params, vocab=vocab)
    model.classifier = transformers_model.classifier
    model.eval()

    # set the GPU device to use
    if cuda_device < 0:
        device = torch.device("cpu")
    else:
        device = torch.device(f"cuda:{cuda_device}")
    model = model.to(device)

    loader = MultiProcessDataLoader(reader,
                                    data_path,
                                    batch_size=batch_size,
                                    shuffle=False)
    loader.index_with(model.vocab)
    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            batch = nn_util.move_to_device(batch, device)
            output_dict = model(**batch)

    metrics = model.get_metrics(reset=True)
    print(metrics)
    if result_save_path is not None:
        with open(result_save_path, "w") as f:
            json.dump(metrics, f)