Example #1
import logging

import torch
import torch.nn as nn
from gensim.models import KeyedVectors
from transformers import BertTokenizerFast

# TrelmRobertaForMaskedLM is the TRELM project's custom model class;
# the import path below is assumed and may differ in your checkout.
from modeling_trelm_roberta import TrelmRobertaForMaskedLM

logger = logging.getLogger(__name__)


def create_trelm_roberta_model(pretrained_model_path, vocab_path,
                               do_lower_case, vocab_emb_path, vocab_emb_type,
                               save_model_to, langid_list):

    tokenizer = BertTokenizerFast(vocab_path, do_lower_case=do_lower_case)

    vocab_emb_weights = None
    if vocab_emb_type == 'pth':
        vocab_emb_data = torch.load(vocab_emb_path)
        vocab_emb_weights = vocab_emb_data['vectors']
        assert tokenizer.vocab_size == vocab_emb_weights.size(0)
    elif vocab_emb_type == 'word2vec':
        wv_model = KeyedVectors.load_word2vec_format(vocab_emb_path)
        vocab_emb_weights = torch.FloatTensor(wv_model.vectors)
        assert tokenizer.vocab_size == vocab_emb_weights.size(0)

    model = TrelmRobertaForMaskedLM.from_pretrained(pretrained_model_path)

    if vocab_emb_weights is not None:
        assert model.config.hidden_size == vocab_emb_weights.size(1)

    # set the hyperparameters
    model.config.vocab_size = tokenizer.vocab_size
    model.config.pad_token_id = tokenizer.pad_token_id
    # model.config.bos_token_id = tokenizer.bos_token_id
    # model.config.eos_token_id = tokenizer.eos_token_id
    model.config.max_position_embeddings = model.config.max_position_embeddings - 1  # drop one slot; the position table below is filled from the old rows [1:]
    model.config.model_type = 'trelm_roberta'
    model.config.architectures = ['TrelmRobertaForMaskedLM']
    model.config.type_vocab_size = 2
    model.config.n_langs = 2
    model.config.langs_to_id = {
        langid: idx
        for idx, langid in enumerate(langid_list)
    }

    # initialize the word embeddings for the new vocabulary
    model.trelm_roberta.embeddings.word_embeddings = nn.Embedding(
        tokenizer.vocab_size,
        model.config.hidden_size,
        padding_idx=model.config.pad_token_id)
    if vocab_emb_weights is not None:
        model.trelm_roberta.embeddings.word_embeddings.weight.data.copy_(
            vocab_emb_weights)
    else:
        logger.info('word_embeddings random initialized!')
        model.trelm_roberta.embeddings.word_embeddings.weight.data.normal_(
            mean=0.0, std=model.config.initializer_range)

    # remove the old lm_head, which is tied to the old vocabulary
    delattr(model, "lm_head")

    # initialize the position embeddings (reuse the old weights, skipping the first row)
    old_position_emb_weight = model.trelm_roberta.embeddings.position_embeddings.weight.data
    model.trelm_roberta.embeddings.position_embeddings = nn.Embedding(
        model.config.max_position_embeddings,
        model.config.hidden_size,
        padding_idx=model.config.pad_token_id)
    model.trelm_roberta.embeddings.position_embeddings.weight.data.copy_(
        old_position_emb_weight[1:])
    model.trelm_roberta.embeddings.position_ids = torch.arange(
        model.config.max_position_embeddings).expand((1, -1))

    # TODO: language embeddings (config.n_langs) are left at their default initialization

    # initialize the token type embeddings (grow the table to type_vocab_size=2)
    new_token_type_embeddings = model.trelm_roberta.embeddings.token_type_embeddings.weight.new_empty(
        model.config.type_vocab_size, model.config.hidden_size)
    # new_empty() does not initialize memory, so give the added row(s) a proper init first
    new_token_type_embeddings.normal_(mean=0.0, std=model.config.initializer_range)
    new_token_type_embeddings[
        0, :] = model.trelm_roberta.embeddings.token_type_embeddings.weight
    model.trelm_roberta.embeddings.token_type_embeddings.weight.data = new_token_type_embeddings

    # initialize the translation layer (tlayer) from the middle encoder layer
    layer = model.trelm_roberta.encoder.layer[int(
        model.config.num_hidden_layers / 2)]

    model.trelm_roberta.encoder.tlayer.attention.self.query.weight = layer.attention.self.query.weight
    model.trelm_roberta.encoder.tlayer.attention.self.query.bias = layer.attention.self.query.bias
    model.trelm_roberta.encoder.tlayer.attention.self.key.weight = layer.attention.self.key.weight
    model.trelm_roberta.encoder.tlayer.attention.self.key.bias = layer.attention.self.key.bias
    model.trelm_roberta.encoder.tlayer.attention.self.value.weight = layer.attention.self.value.weight
    model.trelm_roberta.encoder.tlayer.attention.self.value.bias = layer.attention.self.value.bias

    model.trelm_roberta.encoder.tlayer.attention.output.dense.weight = layer.attention.output.dense.weight
    model.trelm_roberta.encoder.tlayer.attention.output.dense.bias = layer.attention.output.dense.bias
    model.trelm_roberta.encoder.tlayer.attention.output.LayerNorm.weight = layer.attention.output.LayerNorm.weight
    model.trelm_roberta.encoder.tlayer.attention.output.LayerNorm.bias = layer.attention.output.LayerNorm.bias

    model.trelm_roberta.encoder.tlayer.intermediate.dense.weight = layer.intermediate.dense.weight
    model.trelm_roberta.encoder.tlayer.intermediate.dense.bias = layer.intermediate.dense.bias

    model.trelm_roberta.encoder.tlayer.output.dense.weight = layer.output.dense.weight
    model.trelm_roberta.encoder.tlayer.output.dense.bias = layer.output.dense.bias
    model.trelm_roberta.encoder.tlayer.output.LayerNorm.weight = layer.output.LayerNorm.weight
    model.trelm_roberta.encoder.tlayer.output.LayerNorm.bias = layer.output.LayerNorm.bias
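    # note: these assignments tie tlayer's parameters to the middle layer's
    # Parameter objects (shared, not copied)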

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
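
Below is a minimal usage sketch for Example #1. Every path, the embedding type, and the language-ID list are placeholders rather than values from the original project:

# Hypothetical invocation; all paths below are placeholders.
create_trelm_roberta_model(
    pretrained_model_path='path/to/roberta_checkpoint',  # source RoBERTa weights
    vocab_path='path/to/new_vocab.txt',                  # target-language WordPiece vocab
    do_lower_case=False,
    vocab_emb_path='path/to/new_vocab_emb.pth',          # embeddings matching the new vocab
    vocab_emb_type='pth',                                # or 'word2vec'
    save_model_to='path/to/trelm_roberta_init',
    langid_list=['en', 'zh'],                            # order defines config.langs_to_id
)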
Example #2
import logging
import math
import os

from transformers import (
    AutoConfig,
    AutoModelWithLMHead,
    BertTokenizerFast,
    CONFIG_MAPPING,
    DataCollatorForLanguageModeling,
    DataCollatorForPermutationLanguageModeling,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed,
)

logger = logging.getLogger(__name__)

# ModelArguments, DataTrainingArguments and get_dataset are defined elsewhere
# in the same training script.


def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    print("Config before overwrite max_position_embeddings:", config)
    config.max_position_embeddings = 4096
    config.num_hidden_layers = 6
    config.num_attention_heads = 8
    config.hidden_size = 512
    config.intermediate_size = 2048
    print("Config after overwrite max_position_embeddings:", config)

    # if model_args.tokenizer_name:
    #     tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    # elif model_args.model_name_or_path:
    #     tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    # else:
    #     raise ValueError(
    #         "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
    #         "and load it from here, using --tokenizer_name"
    #     )

    logging.info("Loading tokenizer")
    if model_args.tokenizer_name:
        # tokenizer_name is used here as the path to a WordPiece vocab file;
        # BertTokenizerFast takes do_lower_case (a bare `lowercase` kwarg is silently ignored)
        tokenizer = BertTokenizerFast(model_args.tokenizer_name,
                                      clean_text=True,
                                      do_lower_case=False,
                                      strip_accents=True)
    else:
        raise ValueError("Specify tokenizer name")

    logging.info("Loading model")
    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    logging.info("Resizing embeddings")
    model.resize_token_embeddings(len(tokenizer))
    print(len(tokenizer.get_vocab()), len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling).")

    # Get datasets
    logging.info("Loading train dataset")
    train_dataset = get_dataset(data_args) if training_args.do_train else None
    logging.info("Loading eval dataset")
    eval_dataset = (get_dataset(
        data_args,
        evaluate=True,
    ) if training_args.do_eval else None)
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=data_args.mlm,
            mlm_probability=data_args.mlm_probability,
        )

    # Initialize our Trainer
    logging.info("Initializing trainer")
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        logging.info("Training")
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
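
A sketch of how Example #2 might be launched. main() reads its options from the command line via HfArgumentParser, so one way to drive it from Python is to set sys.argv first; the flag names below assume the usual ModelArguments/DataTrainingArguments fields defined elsewhere in the script, and all paths are placeholders:

# Hypothetical launch; flag names and paths are illustrative only.
import sys

sys.argv = [
    "run_language_modeling.py",
    "--model_type", "bert",
    "--tokenizer_name", "path/to/vocab.txt",   # used as a vocab file by BertTokenizerFast above
    "--train_data_file", "path/to/train.txt",
    "--eval_data_file", "path/to/valid.txt",
    "--output_dir", "path/to/output",
    "--do_train",
    "--do_eval",
    "--mlm",
]
main()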