def create_trelm_roberta_model(pretrained_model_path, vocab_path, do_lower_case,
                               vocab_emb_path, vocab_emb_type, save_model_to,
                               langid_list):
    tokenizer = BertTokenizerFast(vocab_path, do_lower_case=do_lower_case)

    # Load (optional) pre-trained embeddings for the new target-language vocabulary.
    vocab_emb_weights = None
    if vocab_emb_type == 'pth':
        vocab_emb_data = torch.load(vocab_emb_path)
        vocab_emb_weights = vocab_emb_data['vectors']
        assert tokenizer.vocab_size == vocab_emb_weights.size(0)
    elif vocab_emb_type == 'word2vec':
        wv_model = KeyedVectors.load_word2vec_format(vocab_emb_path)
        vocab_emb_weights = torch.FloatTensor(wv_model.vectors)
        assert tokenizer.vocab_size == vocab_emb_weights.size(0)

    model = TrelmRobertaForMaskedLM.from_pretrained(pretrained_model_path)
    if vocab_emb_weights is not None:
        assert model.config.hidden_size == vocab_emb_weights.size(1)

    # set the hyperparameters
    model.config.vocab_size = tokenizer.vocab_size
    model.config.pad_token_id = tokenizer.pad_token_id
    # model.config.bos_token_id = tokenizer.bos_token_id
    # model.config.eos_token_id = tokenizer.eos_token_id
    model.config.max_position_embeddings = model.config.max_position_embeddings - 1
    # model.config.model_type = 'trelm_roberta'
    model.config.architectures = ['TrelmRobertaForMaskedLM']
    model.config.type_vocab_size = 2
    model.config.n_langs = 2
    model.config.langs_to_id = {
        langid: idx for idx, langid in enumerate(langid_list)
    }

    # initial the word embeddings
    model.trelm_roberta.embeddings.word_embeddings = nn.Embedding(
        tokenizer.vocab_size,
        model.config.hidden_size,
        padding_idx=model.config.pad_token_id)
    if vocab_emb_weights is not None:
        model.trelm_roberta.embeddings.word_embeddings.weight.data.copy_(
            vocab_emb_weights)
    else:
        logger.info('word_embeddings randomly initialized!')
        model.trelm_roberta.embeddings.word_embeddings.weight.data.normal_(
            mean=0.0, std=model.config.initializer_range)

    # reset lm_head
    delattr(model, "lm_head")

    # initial the position embeddings: one fewer position than the source
    # checkpoint, dropping the first row of the old position embedding matrix.
    old_position_emb_weight = model.trelm_roberta.embeddings.position_embeddings.weight.data
    model.trelm_roberta.embeddings.position_embeddings = nn.Embedding(
        model.config.max_position_embeddings,
        model.config.hidden_size,
        padding_idx=model.config.pad_token_id)
    model.trelm_roberta.embeddings.position_embeddings.weight.data.copy_(
        old_position_emb_weight[1:])
    model.trelm_roberta.embeddings.position_ids = torch.arange(
        model.config.max_position_embeddings).expand((1, -1))

    # initial lang embeddings?

    # initial type embeddings: row 0 is copied from the original single-row
    # token type embedding; row 1 is left uninitialized by new_empty().
    new_token_type_embeddings = model.trelm_roberta.embeddings.token_type_embeddings.weight.new_empty(
        model.config.type_vocab_size, model.config.hidden_size)
    new_token_type_embeddings[
        0, :] = model.trelm_roberta.embeddings.token_type_embeddings.weight
    model.trelm_roberta.embeddings.token_type_embeddings.weight.data = new_token_type_embeddings

    # initial the translation layer: share the middle encoder layer's
    # parameters with the newly added tlayer.
    layer = model.trelm_roberta.encoder.layer[int(
        model.config.num_hidden_layers / 2)]
    tlayer = model.trelm_roberta.encoder.tlayer
    tlayer.attention.self.query.weight = layer.attention.self.query.weight
    tlayer.attention.self.query.bias = layer.attention.self.query.bias
    tlayer.attention.self.key.weight = layer.attention.self.key.weight
    tlayer.attention.self.key.bias = layer.attention.self.key.bias
    tlayer.attention.self.value.weight = layer.attention.self.value.weight
    tlayer.attention.self.value.bias = layer.attention.self.value.bias
    tlayer.attention.output.dense.weight = layer.attention.output.dense.weight
    tlayer.attention.output.dense.bias = layer.attention.output.dense.bias
    tlayer.attention.output.LayerNorm.weight = layer.attention.output.LayerNorm.weight
    tlayer.attention.output.LayerNorm.bias = layer.attention.output.LayerNorm.bias
    tlayer.intermediate.dense.weight = layer.intermediate.dense.weight
    tlayer.intermediate.dense.bias = layer.intermediate.dense.bias
    tlayer.output.dense.weight = layer.output.dense.weight
    tlayer.output.dense.bias = layer.output.dense.bias
    tlayer.output.LayerNorm.weight = layer.output.LayerNorm.weight
    tlayer.output.LayerNorm.bias = layer.output.LayerNorm.bias

    logger.info(f'saving model to {save_model_to}')
    model.save_pretrained(save_model_to)
    tokenizer.save_pretrained(save_model_to)
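
# A minimal usage sketch for create_trelm_roberta_model, kept as a comment so it
# does not run on import. All paths and the language-id list below are
# hypothetical placeholders, not values from this repository: the function
# expects a checkpoint loadable by TrelmRobertaForMaskedLM, a WordPiece vocab
# for the target language, optional pre-trained embeddings for that vocab
# ('pth' or 'word2vec'), an output directory, and the list of language ids.
#
# create_trelm_roberta_model(
#     pretrained_model_path='/path/to/roberta_checkpoint',  # hypothetical
#     vocab_path='/path/to/target_vocab.txt',               # hypothetical
#     do_lower_case=False,
#     vocab_emb_path='/path/to/target_vocab_emb.pth',       # hypothetical
#     vocab_emb_type='pth',                                  # or 'word2vec'
#     save_model_to='/path/to/trelm_roberta_init',           # hypothetical
#     langid_list=['src', 'tgt'],                            # hypothetical ids
# )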
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    # Overwrite the model size and sequence length before the model is built.
    print("Config before overrides:", config)
    config.max_position_embeddings = 4096
    config.num_hidden_layers = 6
    config.num_attention_heads = 8
    config.hidden_size = 512
    config.intermediate_size = 2048
    print("Config after overrides:", config)

    # if model_args.tokenizer_name:
    #     tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    # elif model_args.model_name_or_path:
    #     tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    # else:
    #     raise ValueError(
    #         "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it, "
    #         "and load it from here, using --tokenizer_name"
    #     )
    logging.info("Loading tokenizer")
    if model_args.tokenizer_name:
        # BertTokenizerFast expects `do_lower_case`, not `lowercase`.
        tokenizer = BertTokenizerFast(model_args.tokenizer_name,
                                      clean_text=True,
                                      do_lower_case=False,
                                      strip_accents=True)
    else:
        raise ValueError("Specify tokenizer name")

    logging.info("Loading model")
    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    logging.info("Resizing embeddings")
    model.resize_token_embeddings(len(tokenizer))
    # sanity check: vocab size seen by get_vocab() vs. len(tokenizer)
    print(len(tokenizer.get_vocab()), len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling).")

    # Get datasets
    logging.info("Loading train dataset")
    train_dataset = get_dataset(data_args) if training_args.do_train else None
    logging.info("Loading eval dataset")
    eval_dataset = (get_dataset(
        data_args,
        evaluate=True,
    ) if training_args.do_eval else None)

    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=data_args.mlm,
            mlm_probability=data_args.mlm_probability,
        )

    # Initialize our Trainer
    logging.info("Initializing trainer")
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        logging.info("Training")
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
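
# Assumed entry point (the section ends at `return results`); this follows the
# standard Hugging Face example-script convention for launching `main()` from
# the command line.
if __name__ == "__main__":
    main()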