Example #1
    def __init__(self,
                 vocab: str,
                 pretrained_model: str,
                 use_starting_offsets: bool = False,
                 do_lowercase: bool = False,
                 never_lowercase: List[str] = None,
                 max_pieces: int = 512,
                 truncate_long_sequences: bool = False) -> None:
        if pretrained_model.endswith("-cased") and do_lowercase:
            logger.warning("Your BERT model appears to be cased, "
                           "but your indexer is lowercasing tokens.")
        elif pretrained_model.endswith("-uncased") and not do_lowercase:
            logger.warning("Your BERT model appears to be uncased, "
                           "but your indexer is not lowercasing tokens.")

        xlm_tokenizer = XLMTokenizer.from_pretrained(
            pretrained_model, do_lower_case=do_lowercase)
        super().__init__(vocab=vocab,
                         wordpiece_tokenizer=xlm_tokenizer,
                         namespace="xlm",
                         use_starting_offsets=use_starting_offsets,
                         max_pieces=max_pieces,
                         do_lowercase=do_lowercase,
                         never_lowercase=never_lowercase,
                         start_tokens=[xlm_tokenizer.cls_token],
                         end_tokens=[xlm_tokenizer.sep_token],
                         separator_token=xlm_tokenizer.sep_token,
                         truncate_long_sequences=truncate_long_sequences,
                         lang_dict=xlm_tokenizer.lang2id)
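Example #1 hides the tokenizer behind a wordpiece indexer; the attributes it forwards (cls_token, sep_token, lang2id) can also be inspected directly. A minimal stand-alone sketch, assuming the pytorch_transformers package and the public "xlm-mlm-en-2048" checkpoint (not taken from the example's repository):

from pytorch_transformers import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")

# Special tokens the indexer above uses as start/end/separator pieces.
start_piece, end_piece = tokenizer.cls_token, tokenizer.sep_token

# Plain tokenize -> id round trip; the indexer adds the special pieces itself.
pieces = tokenizer.tokenize("Hello world!")
ids = tokenizer.convert_tokens_to_ids(pieces)
print(start_piece, end_piece, pieces, ids)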
Example #2
def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
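A quick usage sketch for the factory above; "xlm-mlm-en-2048" is simply one published XLM checkpoint name and falls into the "xlm-" branch (the imports are assumed to be the same ones get_tokenizer relies on):

tokenizer = get_tokenizer("xlm-mlm-en-2048")   # dispatches to XLMTokenizer.from_pretrained
pieces = tokenizer.tokenize("hello world")     # XLM BPE pieces
ids = tokenizer.convert_tokens_to_ids(pieces)
print(pieces, ids)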
Example #3
    def __init__(self,
                 vocab: Vocabulary,
                 tasks: List[str],
                 pretrained_model: str,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 decoders: Dict[str, Model],
                 post_encoder_embedder: TextFieldEmbedder = None,
                 dropout: float = 0.0,
                 word_dropout: float = 0.0,
                 mix_embedding: int = None,
                 layer_dropout: float = 0.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(UdifyModel, self).__init__(vocab, regularizer)

        self.tasks = tasks
        self.vocab = vocab
        self.text_field_embedder = text_field_embedder
        self.post_encoder_embedder = post_encoder_embedder
        self.shared_encoder = encoder
        self.word_dropout = word_dropout
        self.dropout = torch.nn.Dropout(p=dropout)
        self.decoders = torch.nn.ModuleDict(decoders)

        if 'bert' in pretrained_model:
            self.tokenizer = BertTokenizer.from_pretrained(pretrained_model,
                                                           do_lower_case=False)
        elif 'xlm' in pretrained_model:
            self.tokenizer = XLMTokenizer.from_pretrained(pretrained_model,
                                                          do_lower_case=False)
        else:
            raise ConfigurationError(
                f"No corresponding tokenizer for pretrained model {pretrained_model}.")

        if mix_embedding:
            self.scalar_mix = torch.nn.ModuleDict({
                task: ScalarMixWithDropout(mix_embedding,
                                           do_layer_norm=False,
                                           dropout=layer_dropout)
                for task in self.decoders
            })
        else:
            self.scalar_mix = None

        self.metrics = {}

        for task in self.tasks:
            if task not in self.decoders:
                raise ConfigurationError(
                    f"Task {task} has no corresponding decoder. Make sure their names match."
                )

        check_dimensions_match(text_field_embedder.get_output_dim(),
                               encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")

        initializer(self)
        self._count_params()
Example #4
def add_pytorch_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in pytorch_transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name

    if tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    else:
        raise ValueError(f"Unsupported tokenizer name: {tokenizer_name!r}")

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place can be simplified by "model-before-preprocess" reorganization
    # we can pass tokenizer created in model here, see issue <TBD>

    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size; it does not include newly added tokens
    if tokenizer_name.startswith("roberta-"):
        if tokenizer.convert_ids_to_tokens(vocab_size - 1) is None:
            vocab_size -= 1
        else:
            log.info("Time to delete vocab_size-1 in preprocess.py !!!")
    # due to a quirk in huggingface's file, the last token of RobertaTokenizer is None, remove
    # this when they fix the problem

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added pytorch_transformers vocab (%s): %d tokens",
             tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
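The "do not use tokenizer.vocab_size" comment above is the key detail: once add_special_tokens has been called, only len(tokenizer) is guaranteed to count the new entries. A small stand-alone sketch, using gpt2 purely as an illustration with the same special-token dict as the function above:

from pytorch_transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
tok.add_special_tokens({"bos_token": "<start>",
                        "sep_token": "<delim>",
                        "cls_token": "<extract>"})
# len(tok) includes the three tokens just added; tok.vocab_size may not,
# which is why add_pytorch_transformers_vocab iterates over len(tokenizer).
print(len(tok), tok.vocab_size)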
Example #5
    def __init__(self,
                 chunck_size=64,
                 max_length=35,
                 device=torch.device('cuda:0')):
        super(XLMClient, self).__init__()
        self.chunck_size = chunck_size
        self.tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
        self.max_length = max_length
        # load the model
        self.model = XLMModel.from_pretrained('xlm-mlm-en-2048')
        self.model.eval()
        self.device = device
        # move model to device
        self.model.to(self.device)
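For context, a minimal forward pass with the same checkpoint the client above loads; this is a sketch of typical XLMModel usage, not code from the XLMClient class:

import torch
from pytorch_transformers import XLMTokenizer, XLMModel

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMModel.from_pretrained('xlm-mlm-en-2048')
model.eval()

with torch.no_grad():
    tokens = tokenizer.tokenize('<s> Hello world </s>')
    ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)])
    last_hidden_state = model(ids)[0]   # (1, sequence_length, 2048)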
Example #6
    def __init__(self):
        self.device = None
        self.n_gpu = None
        self.model = None
        self.processor = None
        self.evaluator = None
        self.parser = argparse.ArgumentParser()
        self.setup_args()
        self.args = self.parser.parse_args()
        self.logger = None
        self.setup_misc()
        if self.args.encoder == 'bert':
            self.tokenizer = BertTokenizer.from_pretrained(self.args.bert_mode)
        elif self.args.encoder == 'xlm':
            self.logger.warn(
                'WE ASSUME THE TEXT IS PRETOKENIZED AND ONLY DO THE FOLLOWING:'
            )
            self.logger.warn(
                '1. Remove invalid character and clean up white space')
            self.logger.warn(
                '2. Lower case, renormalize unicode and strip accents')
            self.logger.warn(
                '3. Split text by white space and do BPE on each "word"')
            if self.args.diff_bpe:
                if self.args.lang_prefix:
                    self.tokenizer = LangPrefixXLMTokenizer.from_pretrained(
                        self.args.xlm_mode,
                        do_lowercase_and_remove_accent=self.args.do_lower_case)
                else:
                    self.tokenizer = NewXLMTokenizer.from_pretrained(
                        self.args.xlm_mode,
                        do_lowercase_and_remove_accent=self.args.do_lower_case)
            else:
                self.tokenizer = XLMTokenizer.from_pretrained(
                    self.args.xlm_mode,
                    do_lowercase_and_remove_accent=self.args.do_lower_case)
        else:
            raise ValueError(self.args.encoder)
        self.param_optimizer, self.optimizer, self.scheduler = None, None, None
        self.global_step, self.total_step = 0, 0
        self.num_train_steps, self.warmup_linear = None, None
        self.last_eval, self.best_eval = 0, float('-inf')
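The do_lowercase_and_remove_accent flag passed above is a standard XLMTokenizer argument; a short sketch of what it controls (the checkpoint name is just an example):

from pytorch_transformers import XLMTokenizer

tok = XLMTokenizer.from_pretrained('xlm-mlm-en-2048',
                                   do_lowercase_and_remove_accent=True)
# Input is lowercased and accents are stripped before BPE is applied.
print(tok.tokenize('Café in Berlin'))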
Example #7
def test_xlm_embeddings():
    xlm_model: str = "xlm-mlm-en-2048"

    tokenizer = XLMTokenizer.from_pretrained(xlm_model)
    model = XLMModel.from_pretrained(pretrained_model_name_or_path=xlm_model,
                                     output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize("<s>" + s + "</s>")

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

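        # hidden_states collects the embedding output followed by one tensor per
        # transformer layer, so index 1 is the first layer's output and the
        # trailing [0] selects the single sentence in the batch.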
        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #    0      1             2           3            4          5         6         7         8       9      10        11       12         13        14
    #
    #   <s>  'berlin</w>', 'and</w>', 'munich</w>', 'have</w>', 'a</w>', 'lot</w>', 'of</w>', 'pupp', 'ete', 'er</w>', 'to</w>', 'see</w>', '.</w>', '</s>
    #           |             |           |            |          |         |         |         \      |      /          |         |          |
    #         Berlin         and        Munich        have        a        lot        of           puppeteer             to       see         .
    #
    #           0             1           2            3          4         5          6               7                  8        9          10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = XLMEmbeddings(
            model=xlm_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s,
                                            pooling_operation="first")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[8].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref ==
            puppeteer_first_subword_embedding_actual)

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s,
                                           pooling_operation="last")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[10].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref ==
            puppeteer_last_subword_embedding_actual)

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation="first_last")

    first_token_embedding_ref = torch.cat([first_layer[1],
                                           first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[8], first_layer[10]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref ==
            puppeteer_first_last_subword_embedding_actual)

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s,
                                           pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding([first_layer[1]
                                                          ]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[
        0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[8], first_layer[9], first_layer[10]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref ==
            puppeteer_mean_subword_embedding_actual)

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence="Munich",
                                          pooling_operation="first",
                                          layers="1,2,3,4")

    ref_embedding_size = 4 * model.embeddings.embedding_dim
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    ref_embedding_size = 1 * model.embeddings.embedding_dim
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
Example #8
def test_xlm_embeddings():
    xlm_model = 'xlm-mlm-en-2048'
    tokenizer = XLMTokenizer.from_pretrained(xlm_model)
    model = XLMModel.from_pretrained(pretrained_model_name_or_path=xlm_model,
                                     output_hidden_states=True)
    model.to(flair.device)
    model.eval()
    s = 'Berlin and Munich have a lot of puppeteer to see .'
    with torch.no_grad():
        tokens = tokenizer.tokenize('<s>' + s + '</s>')
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)
        hidden_states = model(tokens_tensor)[-1]
        first_layer = hidden_states[1][0]
    assert len(first_layer) == len(tokens)

    def embed_sentence(sentence: str,
                       pooling_operation,
                       layers: str = '1',
                       use_scalar_mix: bool = False) -> Sentence:
        embeddings = XLMEmbeddings(pretrained_model_name_or_path=xlm_model,
                                   layers=layers,
                                   pooling_operation=pooling_operation,
                                   use_scalar_mix=use_scalar_mix)
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)
        return flair_sentence

    sentence_first_subword = embed_sentence(sentence=s,
                                            pooling_operation='first')
    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[
        0].embedding.tolist()
    puppeteer_first_subword_embedding_ref = first_layer[8].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref ==
            puppeteer_first_subword_embedding_actual)
    sentence_last_subword = embed_sentence(sentence=s,
                                           pooling_operation='last')
    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[
        0].embedding.tolist()
    puppeteer_last_subword_embedding_ref = first_layer[10].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref ==
            puppeteer_last_subword_embedding_actual)
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation='first_last')
    first_token_embedding_ref = torch.cat([first_layer[1],
                                           first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0].embedding.tolist()
    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[8], first_layer[10]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref ==
            puppeteer_first_last_subword_embedding_actual)
    sentence_mean_subword = embed_sentence(sentence=s,
                                           pooling_operation='mean')
    first_token_embedding_ref = calculate_mean_embedding([first_layer[1]
                                                          ]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[
        0].embedding.tolist()
    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[8], first_layer[9], first_layer[10]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7].embedding.tolist()
    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref ==
            puppeteer_mean_subword_embedding_actual)
    sentence_mult_layers = embed_sentence(sentence='Munich',
                                          pooling_operation='first',
                                          layers='1,2,3,4')
    ref_embedding_size = 4 * model.embeddings.embedding_dim
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)
    assert ref_embedding_size == actual_embedding_size
    sentence_mult_layers_scalar_mix = embed_sentence(sentence='Berlin',
                                                     pooling_operation='first',
                                                     layers='1,2,3,4',
                                                     use_scalar_mix=True)
    ref_embedding_size = 1 * model.embeddings.embedding_dim
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)
    assert ref_embedding_size == actual_embedding_size