Example #1
 def create_and_check_bert_model_for_masked_lm_as_decoder(
         self, config, input_ids, token_type_ids, input_mask,
         sequence_labels, token_labels, choice_labels,
         encoder_hidden_states, encoder_attention_mask):
     model = BertForMaskedLM(config=config)
     model.eval()
     loss, prediction_scores = model(
         input_ids,
         attention_mask=input_mask,
         token_type_ids=token_type_ids,
         masked_lm_labels=token_labels,
         encoder_hidden_states=encoder_hidden_states,
         encoder_attention_mask=encoder_attention_mask)
     # Call again without encoder_attention_mask to exercise the default-mask path
     loss, prediction_scores = model(
         input_ids,
         attention_mask=input_mask,
         token_type_ids=token_type_ids,
         masked_lm_labels=token_labels,
         encoder_hidden_states=encoder_hidden_states)
     result = {
         "loss": loss,
         "prediction_scores": prediction_scores,
     }
     self.parent.assertListEqual(
         list(result["prediction_scores"].size()),
         [self.batch_size, self.seq_length, self.vocab_size])
     self.check_loss_output(result)
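Example #1 dates from an older transformers release in which BertForMaskedLM itself accepted encoder_hidden_states and played the decoder role. In current releases that role belongs to BertLMHeadModel with is_decoder and add_cross_attention enabled; below is a minimal, hedged sketch of that newer pattern (the sizes and random tensors are illustrative, not taken from the test above):

import torch
from transformers import BertConfig, BertLMHeadModel

# Small illustrative config; is_decoder/add_cross_attention enable the decoder path.
config = BertConfig(hidden_size=128, num_hidden_layers=2, num_attention_heads=4,
                    intermediate_size=256, is_decoder=True, add_cross_attention=True)
model = BertLMHeadModel(config)
model.eval()

input_ids = torch.randint(0, config.vocab_size, (1, 8))
encoder_hidden_states = torch.randn(1, 8, config.hidden_size)  # stand-in encoder output

with torch.no_grad():
    outputs = model(input_ids,
                    encoder_hidden_states=encoder_hidden_states,
                    labels=input_ids)

print(outputs.loss.item())    # causal LM loss
print(outputs.logits.shape)   # (1, 8, vocab_size)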
Example #2
 def create_and_check_for_masked_lm(
     self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 ):
     model = BertForMaskedLM(config=config)
     model.to(torch_device)
     model.eval()
     result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
     self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
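Examples #1 and #2 also illustrate the API shift around labels: the older snippet passes masked_lm_labels= and unpacks a plain (loss, prediction_scores) tuple, while the newer one passes labels= and reads fields off the returned model output. A standalone sketch of the newer calling convention (the checkpoint name and input sentence are illustrative):

import torch
from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
model.eval()

inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
with torch.no_grad():
    # Passing labels makes the model also compute the masked-LM loss.
    outputs = model(**inputs, labels=inputs["input_ids"])

print(outputs.loss.item())    # scalar masked-LM loss
print(outputs.logits.shape)   # (batch_size, seq_length, vocab_size)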
Example #3
 def create_and_check_bert_for_masked_lm(self, config, input_ids,
                                         token_type_ids, input_mask,
                                         sequence_labels, token_labels,
                                         choice_labels):
     model = BertForMaskedLM(config=config)
     model.to(torch_device)
     model.eval()
     result = model(input_ids,
                    attention_mask=input_mask,
                    token_type_ids=token_type_ids,
                    labels=token_labels)
     self.parent.assertListEqual(
         list(result["logits"].size()),
         [self.batch_size, self.seq_length, self.vocab_size])
     self.check_loss_output(result)
Example #4
        def create_and_check_bert_for_masked_lm(self, config, input_ids,
                                                token_type_ids, input_mask,
                                                sequence_labels, token_labels,
                                                choice_labels):
            model = BertForMaskedLM(config=config)
            model.eval()
            loss, prediction_scores = model(input_ids,
                                            attention_mask=input_mask,
                                            token_type_ids=token_type_ids,
                                            masked_lm_labels=token_labels)

            #####
            model_desc = ModelDescription([
                self.input_ids_desc, self.attention_mask_desc,
                self.token_type_ids_desc, self.masked_lm_labels_desc
            ], [self.loss_desc, self.prediction_scores_desc])
            args_gradient_accumulation_steps = 8
            args_local_rank = 0
            args_world_size = 1
            args_fp16 = True
            args_allreduce_post_accumulation = True

            model = ORTTrainer(
                model,
                None,
                model_desc,
                "LambOptimizer",
                map_optimizer_attributes=map_optimizer_attributes,
                learning_rate_description=IODescription(
                    'Learning_Rate', [
                        1,
                    ], torch.float32),
                device=self.device,
                postprocess_model=postprocess_model,
                gradient_accumulation_steps=args_gradient_accumulation_steps,
                world_rank=args_local_rank,
                world_size=args_world_size,
                use_mixed_precision=args_fp16,
                allreduce_post_accumulation=args_allreduce_post_accumulation)
            model(input_ids,
                  attention_mask=input_mask,
                  token_type_ids=token_type_ids,
                  masked_lm_labels=token_labels)
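Example #4 wires the same test model into ONNX Runtime's legacy ORTTrainer API, but it relies on two helpers, map_optimizer_attributes and postprocess_model, that are defined elsewhere in its source file. As a rough idea of the shape of the first helper, here is a hypothetical stand-in in the style of ORT's BERT pretraining examples (the hyperparameter keys and values are illustrative assumptions, not the original code):

def map_optimizer_attributes(name):
    # Hypothetical per-parameter LAMB settings: skip weight decay for bias and
    # LayerNorm parameters, apply it everywhere else. Values are illustrative.
    no_decay = ("bias", "gamma", "beta", "LayerNorm")
    if any(term in name for term in no_decay):
        return {"alpha": 0.9, "beta": 0.999, "lambda": 0.0, "epsilon": 1e-6}
    return {"alpha": 0.9, "beta": 0.999, "lambda": 0.01, "epsilon": 1e-6}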
Example #5
def convert_bort_checkpoint_to_pytorch(bort_checkpoint_path: str, pytorch_dump_folder_path: str):
    """
    Convert the original Bort checkpoint (based on MXNet and GluonNLP) to our BERT structure.
    """

    # Original Bort configuration
    bort_4_8_768_1024_hparams = {
        "attention_cell": "multi_head",
        "num_layers": 4,
        "units": 1024,
        "hidden_size": 768,
        "max_length": 512,
        "num_heads": 8,
        "scaled": True,
        "dropout": 0.1,
        "use_residual": True,
        "embed_size": 1024,
        "embed_dropout": 0.1,
        "word_embed": None,
        "layer_norm_eps": 1e-5,
        "token_type_vocab_size": 2,
    }

    predefined_args = bort_4_8_768_1024_hparams

    # Let's construct the original Bort model here
    # Taken from official BERT implementation, see:
    # https://github.com/alexa/bort/blob/master/bort/bort.py
    encoder = BERTEncoder(
        attention_cell=predefined_args["attention_cell"],
        num_layers=predefined_args["num_layers"],
        units=predefined_args["units"],
        hidden_size=predefined_args["hidden_size"],
        max_length=predefined_args["max_length"],
        num_heads=predefined_args["num_heads"],
        scaled=predefined_args["scaled"],
        dropout=predefined_args["dropout"],
        output_attention=False,
        output_all_encodings=False,
        use_residual=predefined_args["use_residual"],
        activation=predefined_args.get("activation", "gelu"),
        layer_norm_eps=predefined_args.get("layer_norm_eps", None),
    )

    # Vocab information needs to be fetched first
    # It's the same as RoBERTa, so RobertaTokenizer can be used later
    vocab_name = "openwebtext_ccnews_stories_books_cased"

    # Specify download folder to Gluonnlp's vocab
    gluon_cache_dir = os.path.join(get_home_dir(), "models")
    bort_vocab = _load_vocab(vocab_name, None, gluon_cache_dir, cls=Vocab)

    original_bort = nlp.model.BERTModel(
        encoder,
        len(bort_vocab),
        units=predefined_args["units"],
        embed_size=predefined_args["embed_size"],
        embed_dropout=predefined_args["embed_dropout"],
        word_embed=predefined_args["word_embed"],
        use_pooler=False,
        use_token_type_embed=False,
        token_type_vocab_size=predefined_args["token_type_vocab_size"],
        use_classifier=False,
        use_decoder=False,
    )

    original_bort.load_parameters(bort_checkpoint_path, cast_dtype=True, ignore_extra=True)
    params = original_bort._collect_params_with_prefix()

    # Build our config 🤗
    hf_bort_config_json = {
        "architectures": ["BertForMaskedLM"],
        "attention_probs_dropout_prob": predefined_args["dropout"],
        "hidden_act": "gelu",
        "hidden_dropout_prob": predefined_args["dropout"],
        "hidden_size": predefined_args["embed_size"],
        "initializer_range": 0.02,
        "intermediate_size": predefined_args["hidden_size"],
        "layer_norm_eps": predefined_args["layer_norm_eps"],
        "max_position_embeddings": predefined_args["max_length"],
        "model_type": "bort",
        "num_attention_heads": predefined_args["num_heads"],
        "num_hidden_layers": predefined_args["num_layers"],
        "pad_token_id": 1,  # 2 = BERT, 1 = RoBERTa
        "type_vocab_size": 1,  # 2 = BERT, 1 = RoBERTa
        "vocab_size": len(bort_vocab),
    }

    hf_bort_config = BertConfig.from_dict(hf_bort_config_json)
    hf_bort_model = BertForMaskedLM(hf_bort_config)
    hf_bort_model.eval()

    # Parameter mapping table (Gluonnlp to Transformers)
    # * denotes layer index
    #
    # | Gluon Parameter                                                | Transformers Parameter
    # | -------------------------------------------------------------- | ----------------------
    # | `encoder.layer_norm.beta`                                      | `bert.embeddings.LayerNorm.bias`
    # | `encoder.layer_norm.gamma`                                     | `bert.embeddings.LayerNorm.weight`
    # | `encoder.position_weight`                                      | `bert.embeddings.position_embeddings.weight`
    # | `word_embed.0.weight`                                          | `bert.embeddings.word_embeddings.weight`
    # | `encoder.transformer_cells.*.attention_cell.proj_key.bias`     | `bert.encoder.layer.*.attention.self.key.bias`
    # | `encoder.transformer_cells.*.attention_cell.proj_key.weight`   | `bert.encoder.layer.*.attention.self.key.weight`
    # | `encoder.transformer_cells.*.attention_cell.proj_query.bias`   | `bert.encoder.layer.*.attention.self.query.bias`
    # | `encoder.transformer_cells.*.attention_cell.proj_query.weight` | `bert.encoder.layer.*.attention.self.query.weight`
    # | `encoder.transformer_cells.*.attention_cell.proj_value.bias`   | `bert.encoder.layer.*.attention.self.value.bias`
    # | `encoder.transformer_cells.*.attention_cell.proj_value.weight` | `bert.encoder.layer.*.attention.self.value.weight`
    # | `encoder.transformer_cells.*.ffn.ffn_2.bias`                   | `bert.encoder.layer.*.attention.output.dense.bias`
    # | `encoder.transformer_cells.*.ffn.ffn_2.weight`                 | `bert.encoder.layer.*.attention.output.dense.weight`
    # | `encoder.transformer_cells.*.layer_norm.beta`                  | `bert.encoder.layer.*.attention.output.LayerNorm.bias`
    # | `encoder.transformer_cells.*.layer_norm.gamma`                 | `bert.encoder.layer.*.attention.output.LayerNorm.weight`
    # | `encoder.transformer_cells.*.ffn.ffn_1.bias`                   | `bert.encoder.layer.*.intermediate.dense.bias`
    # | `encoder.transformer_cells.*.ffn.ffn_1.weight`                 | `bert.encoder.layer.*.intermediate.dense.weight`
    # | `encoder.transformer_cells.*.ffn.layer_norm.beta`              | `bert.encoder.layer.*.output.LayerNorm.bias`
    # | `encoder.transformer_cells.*.ffn.layer_norm.gamma`             | `bert.encoder.layer.*.output.LayerNorm.weight`
    # | `encoder.transformer_cells.*.proj.bias`                        | `bert.encoder.layer.*.output.dense.bias`
    # | `encoder.transformer_cells.*.proj.weight`                      | `bert.encoder.layer.*.output.dense.weight`

    # Helper function to convert MXNet arrays to PyTorch parameters
    def to_torch(mx_array) -> torch.nn.Parameter:
        return torch.nn.Parameter(torch.FloatTensor(mx_array.data().asnumpy()))

    # Check param shapes and map new HF param back
    def check_and_map_params(hf_param, gluon_param):
        shape_hf = hf_param.shape

        gluon_param = to_torch(params[gluon_param])
        shape_gluon = gluon_param.shape

        assert (
            shape_hf == shape_gluon
        ), f"The gluon parameter {gluon_param} has shape {shape_gluon}, but expects shape {shape_hf} for Transformers"

        return gluon_param

    hf_bort_model.bert.embeddings.word_embeddings.weight = check_and_map_params(
        hf_bort_model.bert.embeddings.word_embeddings.weight, "word_embed.0.weight"
    )
    hf_bort_model.bert.embeddings.position_embeddings.weight = check_and_map_params(
        hf_bort_model.bert.embeddings.position_embeddings.weight, "encoder.position_weight"
    )
    hf_bort_model.bert.embeddings.LayerNorm.bias = check_and_map_params(
        hf_bort_model.bert.embeddings.LayerNorm.bias, "encoder.layer_norm.beta"
    )
    hf_bort_model.bert.embeddings.LayerNorm.weight = check_and_map_params(
        hf_bort_model.bert.embeddings.LayerNorm.weight, "encoder.layer_norm.gamma"
    )

    # Inspired by RoBERTa conversion script, we just zero them out (Bort does not use them)
    hf_bort_model.bert.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        hf_bort_model.bert.embeddings.token_type_embeddings.weight.data
    )

    for i in range(hf_bort_config.num_hidden_layers):
        layer: BertLayer = hf_bort_model.bert.encoder.layer[i]

        # self attention
        self_attn: BertSelfAttention = layer.attention.self

        self_attn.key.bias.data = check_and_map_params(
            self_attn.key.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.bias"
        )

        self_attn.key.weight.data = check_and_map_params(
            self_attn.key.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_key.weight"
        )
        self_attn.query.bias.data = check_and_map_params(
            self_attn.query.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.bias"
        )
        self_attn.query.weight.data = check_and_map_params(
            self_attn.query.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_query.weight"
        )
        self_attn.value.bias.data = check_and_map_params(
            self_attn.value.bias.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.bias"
        )
        self_attn.value.weight.data = check_and_map_params(
            self_attn.value.weight.data, f"encoder.transformer_cells.{i}.attention_cell.proj_value.weight"
        )

        # self attention output
        self_output: BertSelfOutput = layer.attention.output

        self_output.dense.bias = check_and_map_params(
            self_output.dense.bias, f"encoder.transformer_cells.{i}.proj.bias"
        )
        self_output.dense.weight = check_and_map_params(
            self_output.dense.weight, f"encoder.transformer_cells.{i}.proj.weight"
        )
        self_output.LayerNorm.bias = check_and_map_params(
            self_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.layer_norm.beta"
        )
        self_output.LayerNorm.weight = check_and_map_params(
            self_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.layer_norm.gamma"
        )

        # intermediate
        intermediate: BertIntermediate = layer.intermediate

        intermediate.dense.bias = check_and_map_params(
            intermediate.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_1.bias"
        )
        intermediate.dense.weight = check_and_map_params(
            intermediate.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_1.weight"
        )

        # output
        bert_output: BertOutput = layer.output

        bert_output.dense.bias = check_and_map_params(
            bert_output.dense.bias, f"encoder.transformer_cells.{i}.ffn.ffn_2.bias"
        )
        bert_output.dense.weight = check_and_map_params(
            bert_output.dense.weight, f"encoder.transformer_cells.{i}.ffn.ffn_2.weight"
        )
        bert_output.LayerNorm.bias = check_and_map_params(
            bert_output.LayerNorm.bias, f"encoder.transformer_cells.{i}.ffn.layer_norm.beta"
        )
        bert_output.LayerNorm.weight = check_and_map_params(
            bert_output.LayerNorm.weight, f"encoder.transformer_cells.{i}.ffn.layer_norm.gamma"
        )

    # Save space and energy 🎄
    hf_bort_model.half()

    # Compare output of both models
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

    input_ids = tokenizer.encode_plus(SAMPLE_TEXT)["input_ids"]

    # Get gluon output
    gluon_input_ids = mx.nd.array([input_ids])
    output_gluon = original_bort(inputs=gluon_input_ids, token_types=[])

    # Get Transformer output (save and reload model again)
    hf_bort_model.save_pretrained(pytorch_dump_folder_path)
    hf_bort_model = BertModel.from_pretrained(pytorch_dump_folder_path)
    hf_bort_model.eval()

    input_ids = tokenizer.encode_plus(SAMPLE_TEXT, return_tensors="pt")
    output_hf = hf_bort_model(**input_ids)[0]

    gluon_layer = output_gluon[0].asnumpy()
    hf_layer = output_hf[0].detach().numpy()

    max_absolute_diff = np.max(np.abs(hf_layer - gluon_layer)).item()
    success = np.allclose(gluon_layer, hf_layer, atol=1e-3)

    if success:
        print("✔️ Both model do output the same tensors")
    else:
        print("❌ Both model do **NOT** output the same tensors")
        print("Absolute difference is:", max_absolute_diff)
Example #6
def get_word_probabilities(
    sentence: str, bert_model: BertForMaskedLM, bert_tokenizer: BertTokenizer
) -> Tuple[Tuple[str, Tuple[str, ...], Tuple[float, ...]], ...]:
    """
    Returns, for each word in the sentence, its subtokens and their probabilities.

    :param sentence: A sentence providing context for the word, max tokens 512.
    :param bert_model: an instance of BertForMaskedLM (preferably cased, large)
    :param bert_tokenizer: a BertTokenizer (preferably cased, large)
    :return: a tuple of (original token string, word as subtokens, subtoken probabilities) triples

    # Doctest skipped because OOME on circleci medium image :-(
    >>> from transformers import BertTokenizer, BertForMaskedLM  # doctest: +SKIP
    >>> bert_tokenizer = BertTokenizer.from_pretrained('bert-large-cased') # doctest: +SKIP
    >>> bert_model = BertForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking") # doctest: +SKIP
    >>> _ = bert_model.eval() # doctest: +SKIP
    >>> get_word_probabilities(sentence="I am psychologist.", # doctest: +SKIP
    ... bert_tokenizer=bert_tokenizer, bert_model=bert_model)
    (('I', ('I',), (0.9858765006065369,)), ('am', ('am',), (0.6945590376853943,)), \
('psychologist', ('psychologist',), (4.13914813179872e-06,)), \
('.', ('.',), (0.8961634635925293,)))

    """
    whole_tokens = word_tokenize(sentence)
    bert_token_map = {
        idx: bert_tokenizer.encode(whole_token, add_special_tokens=False)
        for idx, whole_token in enumerate(whole_tokens)
    }
    start_token_id = bert_tokenizer.encode("[CLS]", add_special_tokens=False)
    end_token_id = bert_tokenizer.encode("[SEP]", add_special_tokens=False)
    total_tokens = len(list(chain.from_iterable(bert_token_map.values())))
    if total_tokens > 510:
        LOG.warning("Too many tokens, should be 510 or less, found %s",
                    total_tokens)
    LOG.debug("# bert tokens: %s # whole tokens: %s", total_tokens,
              len(whole_tokens))
    torch.set_grad_enabled(False)
    word_probas = []
    softmax = torch.nn.Softmax(dim=1)
    for idx in bert_token_map:
        LOG.debug("idx %s", idx)
        bert_model.eval()
        with torch.no_grad():
            tmp_token_map = deepcopy(bert_token_map)
            curr_slot_len = len(tmp_token_map[idx])
            tmp_token_map[idx] = bert_tokenizer.encode(
                " [MASK] " * curr_slot_len, add_special_tokens=False)
            the_tokens = list(chain.from_iterable(tmp_token_map.values()))
            indexed_tokens = list(start_token_id) + the_tokens + list(
                end_token_id)
            LOG.debug("index tokens %s",
                      ",".join([str(tmp) for tmp in indexed_tokens]))
            # pylint: disable=not-callable,no-member
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor(
                [torch.zeros(len(indexed_tokens),
                             dtype=int).tolist()]  # type: ignore
            )
            outputs = bert_model(tokens_tensor,
                                 token_type_ids=segments_tensors)
            predictions = softmax(outputs[0].squeeze(0))
        # get true index for predictions
        starting_position = (
            len(
                list(
                    chain.from_iterable(
                        vals for key, vals in bert_token_map.items() if key < idx
                    )
                )
            )
            + 1
        )
        if curr_slot_len > 1:
            subtokens = []
            probas = []
            for col, orig_token_id in enumerate(bert_token_map[idx]):
                orig_word_proba = predictions[starting_position +
                                              col][orig_token_id].item()
                subtokens.append(
                    bert_tokenizer.convert_ids_to_tokens(orig_token_id))
                probas.append(orig_word_proba)
                LOG.debug(
                    "token %s %s %s",
                    bert_tokenizer.convert_ids_to_tokens(orig_token_id),
                    col,
                    orig_word_proba,
                )
            word_probas.append(
                (whole_tokens[idx], tuple(subtokens), tuple(probas)))
        else:
            orig_token_id = bert_token_map[idx][0]
            orig_word_proba = predictions[starting_position][
                orig_token_id].item()
            word_probas.append((
                whole_tokens[idx],
                tuple(bert_tokenizer.convert_ids_to_tokens([orig_token_id])),
                tuple([orig_word_proba]),
            ))
    return tuple(word_probas)  # type: ignore
Example #7
def get_word_in_sentence_probability(
    sentence: str,
    word: str,
    bert_model: BertForMaskedLM,
    bert_tokenizer: BertTokenizer,
    word_index: int = -1,
) -> Tuple[Tuple[str, float], ...]:
    """
    Given a sentence, and a slot, determine what the probability is for a given word.
    Reports subword tokenization probabilities.

    :param sentence: A sentence providing context for the word, max tokens 512.
    :param word: the word for which you would like the probability; if it's not in the sentence
    you provide, you will have to also pass a word_index argument.
    :param bert_model: an instance of BertForMaskedLM (preferably cased, large)
    :param bert_tokenizer: a BertTokenizer (preferably cased, large)
    :param word_index: The location in the sentence for which you would like the probability of
     the `word` parameter. Zero-based index of the words.
    :return: a tuple of (token, probability) pairs, where each probability is a softmax value.
    If the word maps to a single subtoken, a one-element tuple containing only that probability is returned.

    # Doctest skipped because OOME on circleci medium image :-(
    >>> from transformers import BertTokenizer, BertForMaskedLM # doctest: +SKIP
    >>> bert_tokenizer = BertTokenizer.from_pretrained('bert-large-cased') # doctest: +SKIP
    >>> bert_model = BertForMaskedLM.from_pretrained("bert-large-cased-whole-word-masking") # doctest: +SKIP
    >>> _ = bert_model.eval() # doctest: +SKIP
    >>> get_word_in_sentence_probability(sentence="Yoga brings peace and vitality to you life.", # doctest: +SKIP
    ... word='your', bert_model=bert_model, bert_tokenizer=bert_tokenizer, word_index=6)
    (0.004815567284822464,)

    """
    whole_tokens = word_tokenize(sentence)
    if word_index == -1:
        word_index = whole_tokens.index(word)
    bert_token_map = {
        idx: bert_tokenizer.encode(whole_token, add_special_tokens=False)
        for idx, whole_token in enumerate(whole_tokens)
    }  # type: Dict[int,List[int]]
    mask_token_id = bert_tokenizer.encode("[MASK]", add_special_tokens=False)
    tokens_to_predict = bert_tokenizer.encode(word, add_special_tokens=False)
    bert_token_map[word_index] = mask_token_id * len(
        tokens_to_predict)  # type: ignore
    LOG.debug(
        "total bert tokens: %s whole tokens: %s",
        len(list(chain.from_iterable(bert_token_map.values()))),
        len(whole_tokens),
    )
    torch.set_grad_enabled(False)
    bert_model.eval()
    # to find the true index of the desired word; count all of the tokens and subtokens before
    starting_position = (
        len(
            list(
                chain.from_iterable(
                    vals for key, vals in bert_token_map.items() if key < word_index
                )
            )
        )
        + 1
    )
    start_token_id = bert_tokenizer.encode("[CLS]", add_special_tokens=False)
    end_token_id = bert_tokenizer.encode("[SEP]", add_special_tokens=False)
    the_tokens = list(chain.from_iterable(bert_token_map.values()))
    LOG.debug(bert_tokenizer.convert_ids_to_tokens(the_tokens))
    indexed_tokens = list(start_token_id) + the_tokens + list(end_token_id)
    softmax = torch.nn.Softmax(dim=1)
    with torch.no_grad():
        # pylint: disable=not-callable,no-member
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor(
            [torch.zeros(len(indexed_tokens),
                         dtype=int).tolist()]  # type: ignore
        )
        outputs = bert_model(tokens_tensor, token_type_ids=segments_tensors)
        predictions = softmax(outputs[0].squeeze(0))
    if len(tokens_to_predict) == 1:
        return tuple([
            predictions[starting_position][tokens_to_predict].item()
        ])  # type: ignore
    return tuple([
        (
            bert_tokenizer.convert_ids_to_tokens(tmp),
            predictions[starting_position + idx][tmp].item(),  # type: ignore
        ) for idx, tmp in enumerate(tokens_to_predict)
    ])  # type: ignore
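Examples #6 and #7 share the same core step: replace the target slot with [MASK] tokens, run BertForMaskedLM, softmax over the vocabulary, and read off the probability at the original token id. A condensed, hedged sketch of that step as a standalone helper (the function and variable names are mine, not from the source above; model and tokenizer are a BertForMaskedLM / BertTokenizer pair as in the doctests):

import torch

def masked_token_probability(sentence_ids, position, target_id, model, tokenizer):
    """Probability of `target_id` at `position` when that slot is masked.

    `sentence_ids` is a list of token ids without special tokens; `position`
    indexes into that list. Illustrative helper, not taken from the source.
    """
    ids = list(sentence_ids)
    ids[position] = tokenizer.mask_token_id
    input_ids = torch.tensor([tokenizer.build_inputs_with_special_tokens(ids)])
    with torch.no_grad():
        logits = model(input_ids).logits
    probs = torch.softmax(logits[0], dim=-1)
    # +1 accounts for the [CLS] token prepended by build_inputs_with_special_tokens
    return probs[position + 1, target_id].item()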
Example #8
class Prober():
    def __init__(self, args, random_init='none'):
        assert (random_init in ['none', 'all', 'embedding'])

        super().__init__()

        self._model_device = 'cpu'

        model_name = args.model_name
        vocab_name = model_name

        if args.model_dir is not None:
            # load bert model from file
            model_name = str(args.model_dir) + "/"
            vocab_name = model_name
            logger.info("loading BERT model from {}".format(model_name))

        # Load pre-trained model tokenizer (vocabulary)
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)
        if torch.cuda.device_count() > 1:
            torch.cuda.manual_seed_all(args.seed)

        config = AutoConfig.from_pretrained(model_name)
        if isinstance(config, AlbertConfig):
            self.model_type = 'albert'
            self.tokenizer = AlbertTokenizer.from_pretrained(vocab_name)
            self.mlm_model = AlbertForMaskedLM.from_pretrained(model_name)
            if random_init == 'all':
                logger.info('Random initialize model...')
                self.mlm_model = AlbertForMaskedLM(self.mlm_model.config)
            self.base_model = self.mlm_model.albert
        elif isinstance(config, RobertaConfig):
            self.model_type = 'roberta'
            self.tokenizer = RobertaTokenizer.from_pretrained(vocab_name)
            self.mlm_model = RobertaForMaskedLM.from_pretrained(model_name)
            if random_init == 'all':
                logger.info('Random initialize model...')
                self.mlm_model = RobertaForMaskedLM(self.mlm_model.config)
            self.base_model = self.mlm_model.roberta
        elif isinstance(config, BertConfig):
            self.model_type = 'bert'
            self.tokenizer = BertTokenizer.from_pretrained(vocab_name)
            self.mlm_model = BertForMaskedLM.from_pretrained(model_name)
            if random_init == 'all':
                logger.info('Random initialize model...')
                self.mlm_model = BertForMaskedLM(self.mlm_model.config)
            self.base_model = self.mlm_model.bert
        else:
            raise ValueError('Model %s not supported yet!' % (model_name))

        self.mlm_model.eval()

        if random_init == 'embedding':
            logger.info('Random initialize embedding layer...')
            self.mlm_model._init_weights(
                self.base_model.embeddings.word_embeddings)

        # original vocab
        self.map_indices = None
        self.vocab = list(self.tokenizer.get_vocab().keys())
        logger.info('Vocab size: %d' % len(self.vocab))
        self._init_inverse_vocab()

        self.MASK = self.tokenizer.mask_token
        self.EOS = self.tokenizer.eos_token
        self.CLS = self.tokenizer.cls_token
        self.SEP = self.tokenizer.sep_token
        self.UNK = self.tokenizer.unk_token
        # print(self.MASK, self.EOS, self.CLS, self.SEP, self.UNK)

        self.pad_id = self.inverse_vocab[self.tokenizer.pad_token]
        self.unk_index = self.inverse_vocab[self.tokenizer.unk_token]

        # used to output top-k predictions
        self.k = args.k

    def _cuda(self):
        self.mlm_model.cuda()

    def try_cuda(self):
        """Move model to GPU if one is available."""
        if torch.cuda.is_available():
            if self._model_device != 'cuda':
                logger.info('Moving model to CUDA')
                self._cuda()
                self._model_device = 'cuda'
        else:
            logger.info('No CUDA found')

    def init_indices_for_filter_logprobs(self, vocab_subset, logger=None):
        index_list = []
        new_vocab_subset = []
        for word in vocab_subset:
            tokens = self.tokenizer.tokenize(' ' + word)
            if (len(tokens) == 1) and (tokens[0] != self.UNK):
                index_list.append(
                    self.tokenizer.convert_tokens_to_ids(tokens)[0])
                new_vocab_subset.append(word)
            else:
                msg = "word {} from vocab_subset not in model vocabulary!".format(
                    word)
                if logger is not None:
                    logger.warning(msg)
                else:
                    # the `logger` parameter is None in this branch, so fall back to print
                    print("WARNING: {}".format(msg))

        indices = torch.as_tensor(index_list)
        return indices, index_list

    def _init_inverse_vocab(self):
        self.inverse_vocab = {w: i for i, w in enumerate(self.vocab)}

    def get_id(self, string):
        tokenized_text = self.tokenizer.tokenize(string)
        indexed_string = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        if self.map_indices is not None:
            # map indices to subset of the vocabulary
            indexed_string = self.convert_ids(indexed_string)

        return indexed_string

    def _get_input_tensors_batch_train(self, sentences_list, samples_list):
        tokens_tensors_list = []
        segments_tensors_list = []
        masked_indices_list = []
        tokenized_text_list = []
        mlm_labels_tensor_list = []
        mlm_label_ids = []

        max_tokens = 0
        for (sentences, samples) in zip(sentences_list, samples_list):
            tokens_tensor, segments_tensor, masked_indices, tokenized_text, mlm_labels_tensor, mlm_label_id = self.__get_input_tensors(
                sentences, mlm_label=samples['obj_label'])
            tokens_tensors_list.append(tokens_tensor)
            segments_tensors_list.append(segments_tensor)
            masked_indices_list.append(masked_indices)
            tokenized_text_list.append(tokenized_text)
            mlm_labels_tensor_list.append(mlm_labels_tensor)
            mlm_label_ids.append(mlm_label_id)
            if (tokens_tensor.shape[1] > max_tokens):
                max_tokens = tokens_tensor.shape[1]

        # apply padding and concatenate tensors
        # use [PAD] for tokens and 0 for segments
        final_tokens_tensor = None
        final_segments_tensor = None
        final_attention_mask = None
        final_mlm_labels_tensor = None
        for tokens_tensor, segments_tensor, mlm_labels_tensor in zip(
                tokens_tensors_list, segments_tensors_list,
                mlm_labels_tensor_list):
            dim_tensor = tokens_tensor.shape[1]
            pad_length = max_tokens - dim_tensor
            attention_tensor = torch.full([1, dim_tensor], 1, dtype=torch.long)
            if pad_length > 0:
                pad_1 = torch.full([1, pad_length], self.pad_id, dtype=torch.long)
                pad_2 = torch.full([1, pad_length], 0, dtype=torch.long)
                attention_pad = torch.full([1, pad_length], 0, dtype=torch.long)
                pad_3 = torch.full([1, pad_length], -100, dtype=torch.long)
                tokens_tensor = torch.cat((tokens_tensor, pad_1), dim=1)
                segments_tensor = torch.cat((segments_tensor, pad_2), dim=1)
                attention_tensor = torch.cat((attention_tensor, attention_pad), dim=1)
                mlm_labels_tensor = torch.cat((mlm_labels_tensor, pad_3), dim=1)
            if final_tokens_tensor is None:
                final_tokens_tensor = tokens_tensor
                final_segments_tensor = segments_tensor
                final_attention_mask = attention_tensor
                final_mlm_labels_tensor = mlm_labels_tensor
            else:
                final_tokens_tensor = torch.cat(
                    (final_tokens_tensor, tokens_tensor), dim=0)
                final_segments_tensor = torch.cat(
                    (final_segments_tensor, segments_tensor), dim=0)
                final_attention_mask = torch.cat(
                    (final_attention_mask, attention_tensor), dim=0)
                final_mlm_labels_tensor = torch.cat(
                    (final_mlm_labels_tensor, mlm_labels_tensor), dim=0)

        return final_tokens_tensor, final_segments_tensor, final_attention_mask, masked_indices_list, tokenized_text_list, final_mlm_labels_tensor, mlm_label_ids

    def __get_input_tensors_batch(self, sentences_list):
        tokens_tensors_list = []
        segments_tensors_list = []
        masked_indices_list = []
        tokenized_text_list = []
        max_tokens = 0
        for sentences in sentences_list:
            tokens_tensor, segments_tensor, masked_indices, tokenized_text = self.__get_input_tensors(
                sentences)
            tokens_tensors_list.append(tokens_tensor)
            segments_tensors_list.append(segments_tensor)
            masked_indices_list.append(masked_indices)
            tokenized_text_list.append(tokenized_text)
            if (tokens_tensor.shape[1] > max_tokens):
                max_tokens = tokens_tensor.shape[1]
        # logger.info("MAX_TOKENS: {}".format(max_tokens))
        # apply padding and concatenate tensors
        # use [PAD] for tokens and 0 for segments
        final_tokens_tensor = None
        final_segments_tensor = None
        final_attention_mask = None
        for tokens_tensor, segments_tensor in zip(tokens_tensors_list,
                                                  segments_tensors_list):
            dim_tensor = tokens_tensor.shape[1]
            pad_length = max_tokens - dim_tensor
            attention_tensor = torch.full([1, dim_tensor], 1, dtype=torch.long)
            if pad_length > 0:
                pad_1 = torch.full([1, pad_length], self.pad_id, dtype=torch.long)
                pad_2 = torch.full([1, pad_length], 0, dtype=torch.long)
                attention_pad = torch.full([1, pad_length], 0, dtype=torch.long)
                tokens_tensor = torch.cat((tokens_tensor, pad_1), dim=1)
                segments_tensor = torch.cat((segments_tensor, pad_2), dim=1)
                attention_tensor = torch.cat((attention_tensor, attention_pad), dim=1)
            if final_tokens_tensor is None:
                final_tokens_tensor = tokens_tensor
                final_segments_tensor = segments_tensor
                final_attention_mask = attention_tensor
            else:
                final_tokens_tensor = torch.cat(
                    (final_tokens_tensor, tokens_tensor), dim=0)
                final_segments_tensor = torch.cat(
                    (final_segments_tensor, segments_tensor), dim=0)
                final_attention_mask = torch.cat(
                    (final_attention_mask, attention_tensor), dim=0)
        # logger.info(final_tokens_tensor)
        # logger.info(final_segments_tensor)
        # logger.info(final_attention_mask)
        # logger.info(final_tokens_tensor.shape)
        # logger.info(final_segments_tensor.shape)
        # logger.info(final_attention_mask.shape)
        return final_tokens_tensor, final_segments_tensor, final_attention_mask, masked_indices_list, tokenized_text_list

    def __get_input_tensors(self, sentences, mlm_label=None):

        if len(sentences) > 2:
            logger.info(sentences)
            raise ValueError(
                "BERT accepts maximum two sentences in input for each data point"
            )

        first_tokenized_sentence = [
            self.tokenizer.tokenize(token) if
            ((not token.startswith('[unused')) and
             (token != self.MASK)) else [token]
            for token in sentences[0].split()
        ]
        first_tokenized_sentence = [
            item for sublist in first_tokenized_sentence for item in sublist
        ]
        if self.model_type == 'roberta':
            first_tokenized_sentence = self.tokenizer.tokenize(sentences[0])
        first_segment_id = np.zeros(len(first_tokenized_sentence),
                                    dtype=int).tolist()

        # add [SEP] token at the end
        first_tokenized_sentence.append(self.SEP)
        first_segment_id.append(0)

        if len(sentences) > 1:
            second_tokenized_sentence = [
                self.tokenizer.tokenize(token)
                if not token.startswith('[unused') else [token]
                for token in sentences[1].split()
            ]
            second_tokenized_sentence = [
                item for sublist in second_tokenized_sentence for item in sublist
            ]
            if self.model_type == 'roberta':
                second_tokenized_sentence = self.tokenizer.tokenize(sentences[1])
            second_segment_id = np.full(len(second_tokenized_sentence), 1,
                                        dtype=int).tolist()

            # add [SEP] token at the end
            second_tokenized_sentence.append(self.SEP)
            second_segment_id.append(1)

            tokenized_text = first_tokenized_sentence + second_tokenized_sentence
            segments_ids = first_segment_id + second_segment_id
        else:
            tokenized_text = first_tokenized_sentence
            segments_ids = first_segment_id

        # add [CLS] token at the beginning
        tokenized_text.insert(0, self.CLS)
        segments_ids.insert(0, 0)

        # look for masked indices
        masked_indices = []
        for i in range(len(tokenized_text)):
            token = tokenized_text[i]
            if token == self.MASK:
                masked_indices.append(i)

        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])

        if mlm_label is None:
            return tokens_tensor, segments_tensors, masked_indices, tokenized_text

        # Handle mlm_label
        mlm_labels = np.full(len(tokenized_text), -100, dtype=int).tolist()
        tmp_ids = self.tokenizer.convert_tokens_to_ids(
            self.tokenizer.tokenize(' ' + mlm_label))
        assert (len(tmp_ids) == 1)
        mlm_labels[masked_indices[-1]] = tmp_ids[0]
        mlm_labels_tensor = torch.tensor([mlm_labels])

        return (tokens_tensor, segments_tensors, masked_indices, tokenized_text,
                mlm_labels_tensor, tmp_ids[0])

    def __get_token_ids_from_tensor(self, indexed_string):
        token_ids = []
        if self.map_indices is not None:
            # map indices to subset of the vocabulary
            indexed_string = self.convert_ids(indexed_string)
            token_ids = np.asarray(indexed_string)
        else:
            token_ids = indexed_string
        return token_ids

    def get_batch_generation(self, sentences_list, logger=None, try_cuda=True):
        if not sentences_list:
            return None
        if try_cuda:
            self.try_cuda()

        tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list = self.__get_input_tensors_batch(
            sentences_list)

        if logger is not None:
            logger.debug("\n{}\n".format(tokenized_text_list))

        with torch.no_grad():
            # take the prediction logits (first element of the model output)
            logits = self.mlm_model(
                input_ids=tokens_tensor.to(self._model_device),
                token_type_ids=segments_tensor.to(self._model_device),
                attention_mask=attention_mask_tensor.to(self._model_device),
            )[0]

            log_probs = F.log_softmax(logits, dim=-1).cpu()

        token_ids_list = []
        for indexed_string in tokens_tensor.numpy():
            token_ids_list.append(
                self.__get_token_ids_from_tensor(indexed_string))

        return log_probs, token_ids_list, masked_indices_list

    def run_batch(self,
                  sentences_list,
                  samples_list,
                  try_cuda=True,
                  training=True,
                  filter_indices=None,
                  index_list=None,
                  vocab_to_common_vocab=None):
        if try_cuda and torch.cuda.device_count() > 0:
            self.try_cuda()

        tokens_tensor, segments_tensor, attention_mask_tensor, masked_indices_list, tokenized_text_list, mlm_labels_tensor, mlm_label_ids = self._get_input_tensors_batch_train(
            sentences_list, samples_list)

        if training:
            self.mlm_model.train()
            loss = self.mlm_model(
                input_ids=tokens_tensor.to(self._model_device),
                token_type_ids=segments_tensor.to(self._model_device),
                attention_mask=attention_mask_tensor.to(self._model_device),
                masked_lm_labels=mlm_labels_tensor.to(self._model_device),
            )
            loss = loss[0]
        else:
            self.mlm_model.eval()
            with torch.no_grad():
                loss, logits = self.mlm_model(
                    input_ids=tokens_tensor.to(self._model_device),
                    token_type_ids=segments_tensor.to(self._model_device),
                    attention_mask=attention_mask_tensor.to(
                        self._model_device),
                    masked_lm_labels=mlm_labels_tensor.to(self._model_device),
                )
            log_probs = F.log_softmax(logits, dim=-1).cpu()

        if training:
            return loss
        else:
            # During testing, return accuracy and top-k predictions
            tot = log_probs.shape[0]
            cor = 0
            preds = []
            topk = []
            common_vocab_loss = []

            for i in range(log_probs.shape[0]):
                masked_index = masked_indices_list[i][0]
                log_prob = log_probs[i][masked_index]
                mlm_label = mlm_label_ids[i]
                if filter_indices is not None:
                    log_prob = log_prob.index_select(dim=0,
                                                     index=filter_indices)
                    pred_common_vocab = torch.argmax(log_prob)
                    pred = index_list[pred_common_vocab]

                    # get top-k predictions
                    topk_preds = []
                    topk_log_prob, topk_ids = torch.topk(log_prob, self.k)
                    for log_prob_i, idx in zip(topk_log_prob, topk_ids):
                        ori_idx = index_list[idx]
                        token = self.vocab[ori_idx]
                        topk_preds.append({
                            'token': token,
                            'log_prob': log_prob_i.item()
                        })
                    topk.append(topk_preds)

                    # compute entropy on common vocab
                    common_logits = logits[i][masked_index].cpu().index_select(
                        dim=0, index=filter_indices)
                    common_log_prob = -F.log_softmax(common_logits, dim=-1)
                    common_label_id = vocab_to_common_vocab[mlm_label]
                    common_vocab_loss.append(
                        common_log_prob[common_label_id].item())
                else:
                    pred = torch.argmax(log_prob)
                    topk.append([])
                if pred == mlm_labels_tensor[i][masked_index]:
                    cor += 1
                    preds.append(1)
                else:
                    preds.append(0)

            return log_probs, cor, tot, preds, topk, loss, common_vocab_loss
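The Prober constructor only reads a handful of fields off its args object (model_name, model_dir, seed, k), so it can be exercised without the original argument parser. A hedged usage sketch, assuming the module-level imports and logger used by the class are in scope (the field names are inferred from the constructor above; the checkpoint name is illustrative):

from types import SimpleNamespace

args = SimpleNamespace(model_name="bert-base-cased", model_dir=None, seed=42, k=10)
prober = Prober(args, random_init='none')

sentences_list = [["The capital of France is " + prober.MASK + " ."]]
log_probs, token_ids_list, masked_indices_list = prober.get_batch_generation(
    sentences_list, try_cuda=False)

# Top-k tokens for the first [MASK] position of the first sentence
masked_index = masked_indices_list[0][0]
topk = log_probs[0][masked_index].topk(prober.k)
print([prober.vocab[i] for i in topk.indices.tolist()])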
Example #9
class BertEncoder(object):
    def __init__(self, device='cpu', model="bert", random=False):

        # config = BertConfig.from_pretrained("bert-large-uncased-whole-word-masking", output_hidden_states=True)
        # self.tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking')
        # self.model = BertForMaskedLM.from_pretrained('bert-large-uncased-whole-word-masking', config = config)

        config = BertConfig.from_pretrained("bert-base-uncased",
                                            output_hidden_states=True)

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        if not random:
            self.model = BertForMaskedLM.from_pretrained('bert-base-uncased',
                                                         config=config)
        else:
            self.model = BertForMaskedLM(config)
        # config = AlbertConfig.from_pretrained("albert-xlarge-v2", output_hidden_states=True)
        # self.tokenizer = AlbertTokenizer.from_pretrained("albert-xlarge-v2")
        # self.model = AlbertModel.from_pretrained("albert-xlarge-v2", config = config)
        # config = RobertaConfig.from_pretrained("roberta-large", output_hidden_states=True)
        # self.tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
        # self.model = RobertaModel.from_pretrained('roberta-large', config = config)

        self.final_transform = self.model.cls.predictions.transform

        self.model.eval()
        self.model.to(device)
        self.device = device

    def tokenize(
            self,
            original_sentence: List[str]) -> Tuple[List[str], Dict[int, int]]:
        """
        Parameters
        ----------
        Returns
        -------
        bert_tokens: The sentence, tokenized by BERT tokenizer.
        orig_to_tok_map: An output dictionary consisting of a mapping (alignment) between indices in the original tokenized sentence, and indices in the sentence tokenized by the BERT tokenizer. See https://github.com/google-research/bert
        """

        bert_tokens = ["[CLS]"]
        orig_to_tok_map = {}
        has_subwords = False
        is_subword = []

        for i, w in enumerate(original_sentence):
            tokenized_w = self.tokenizer.tokenize(w)
            has_subwords = len(tokenized_w) > 1
            is_subword.append(has_subwords)
            bert_tokens.extend(tokenized_w)

            orig_to_tok_map[i] = len(bert_tokens) - 1

        bert_tokens.append("[SEP]")
        return (bert_tokens, orig_to_tok_map)

    def encode(self,
               sentence: str,
               layers: List[int],
               final_transform: bool = True,
               pos_ind=-1,
               mask_prob=1.0):

        tokenized_text, orig2tok = self.tokenize(sentence.split(" "))
        pos_ind_bert = orig2tok[pos_ind]
        if np.random.random() < mask_prob:
            tokenized_text[pos_ind_bert] = "[MASK]"
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        tokens_tensor = torch.tensor([indexed_tokens]).to(self.device)

        with torch.no_grad():
            outputs = self.model(tokens_tensor)
            predictions = torch.cat([outputs[1][layer][0] for layer in layers],
                                    axis=-1)  # .detach().cpu().numpy()
            if final_transform:
                predictions = self.final_transform(predictions)
            predictions = predictions.detach().cpu().numpy()
            """
            if layer >= 0:
                predictions = outputs[2][layer].detach().cpu().numpy()
            else:
                concat = torch.cat(outputs[2], axis = 0)
                concat = concat[:7, :, :]
                predictions = concat.reshape(concat.shape[1], concat.shape[0] * concat.shape[2])

                print(predictions.shape)
                print("----------------------------")
                #predictions = torch.sum(concat, axis = 0).detach().cpu().numpy()
            """
            return (predictions.squeeze(), orig2tok, tokenized_text)
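A hedged usage sketch for BertEncoder, assuming the class's own imports (torch, numpy, transformers) are in scope; the sentence, the masked position, and the layer choice are illustrative:

encoder = BertEncoder(device="cpu")

sentence = "The doctor went to the hospital"
# Mask the word at index 1 ("doctor") and concatenate the last two hidden layers.
vectors, orig2tok, tokens = encoder.encode(
    sentence, layers=[-1, -2], final_transform=False, pos_ind=1, mask_prob=1.0)

print(tokens)          # BERT tokens with [MASK] substituted at the chosen position
print(vectors.shape)   # (num_bert_tokens, 2 * hidden_size)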