def get_loaders(self, stage: str, epoch: int = None) -> "OrderedDict[str, DataLoader]":
    """Returns loaders for the stage.

    Args:
        stage: string with stage name
        epoch: epoch number

    Returns:
        Dict of loaders
    """
    data_params = dict(self.stages_config[stage]["data_params"])
    model_name = data_params["model_name"]

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    collate_fn = DataCollatorForLanguageModeling(tokenizer)
    loaders_params = {
        "train": {"collate_fn": collate_fn},
        "valid": {"collate_fn": collate_fn},
    }

    loaders = utils.get_loaders_from_params(
        get_datasets_fn=self.get_datasets,
        initial_seed=self.initial_seed,
        stage=stage,
        loaders_params=loaders_params,
        **data_params,
    )
    return loaders
class LanguageModelingDataCollator(DataCollator):
    """
    Registered as a `DataCollator` with name `LanguageModelingDataCollator`.
    Used for language modeling.
    """

    def __init__(
        self,
        model_name: str,
        mlm: bool = True,
        mlm_probability: float = 0.15,
        filed_name: str = "source",
        namespace: str = "tokens",
    ):
        self._field_name = filed_name
        self._namespace = namespace
        from allennlp.common import cached_transformers

        tokenizer = cached_transformers.get_tokenizer(model_name)
        self._collator = DataCollatorForLanguageModeling(tokenizer, mlm, mlm_probability)

    def __call__(self, instances: List[Instance]) -> TensorDict:
        tensor_dicts = allennlp_collate(instances)
        tensor_dicts = self.process_tokens(tensor_dicts)
        return tensor_dicts

    def process_tokens(self, tensor_dicts: TensorDict) -> TensorDict:
        inputs = tensor_dicts[self._field_name][self._namespace]["token_ids"]
        inputs, labels = self._collator.mask_tokens(inputs)
        tensor_dicts[self._field_name][self._namespace]["token_ids"] = inputs
        tensor_dicts[self._field_name][self._namespace]["labels"] = labels
        return tensor_dicts
def _train_model(self, model, tokenizer, train_dataset, val_dataset, **train_kwargs):
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)
    train_args = self._get_train_args(**train_kwargs)
    trainer = transformers.Trainer(
        model=model,
        args=train_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )
    trainer.train()
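The `_get_train_args` helper is not shown in this example. A minimal, hypothetical sketch of what it might return, assuming it just forwards keyword overrides into `transformers.TrainingArguments` (the default values below are assumptions, not the original implementation):

# Hypothetical sketch: merge caller overrides into TrainingArguments defaults.
from transformers import TrainingArguments

def _get_train_args(self, **train_kwargs):
    defaults = dict(
        output_dir="./mlm_output",        # assumed default output location
        num_train_epochs=3,
        per_device_train_batch_size=8,
    )
    defaults.update(train_kwargs)         # caller-supplied kwargs win
    return TrainingArguments(**defaults)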
def __init__(
    self,
    model_name: str,
    mlm: bool = True,
    mlm_probability: float = 0.15,
    filed_name: str = "source",
    namespace: str = "tokens",
):
    self._field_name = filed_name
    self._namespace = namespace
    from allennlp.common import cached_transformers

    tokenizer = cached_transformers.get_tokenizer(model_name)
    self._collator = DataCollatorForLanguageModeling(tokenizer, mlm, mlm_probability)

    if hasattr(self._collator, "mask_tokens"):
        # For compatibility with transformers < 4.10
        self._mask_tokens = self._collator.mask_tokens
    else:
        self._mask_tokens = self._collator.torch_mask_tokens
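For reference, a minimal sketch of how the wrapped masking call behaves outside the class, using the same old/new fallback; the model name and sentences are placeholders, and the exact method name depends on the installed transformers release:

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorForLanguageModeling(tokenizer, mlm=True, mlm_probability=0.15)

# Older releases expose mask_tokens, newer ones torch_mask_tokens.
mask_tokens = getattr(collator, "mask_tokens", None) or collator.torch_mask_tokens

batch = tokenizer(["a first sentence", "a second sentence"], padding=True, return_tensors="pt")
inputs, labels = mask_tokens(batch["input_ids"].clone())
# `inputs` now holds [MASK]/random replacements; `labels` is -100 except at masked positions.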
def test_is_running():
    """Test that perplexity runs normally."""
    tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    model = AutoModelWithLMHead.from_pretrained("distilbert-base-uncased")
    dataset = LanguageModelingDataset(texts, tok)
    collate_fn = DataCollatorForLanguageModeling(tok).collate_batch
    dataloader = torch.utils.data.DataLoader(dataset, collate_fn=collate_fn)
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
    runner = HuggingFaceRunner()
    runner.train(
        model=model,
        optimizer=optimizer,
        loaders={"train": dataloader},
        callbacks={
            "optimizer": dl.OptimizerCallback(),
            "perplexity": PerplexityMetricCallback(),
        },
        check=True,
    )
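Note that `.collate_batch` only exists in older transformers releases; in newer ones the collator is callable directly. A sketch of the equivalent two lines, assuming the rest of the test stays unchanged:

collate_fn = DataCollatorForLanguageModeling(tok)  # __call__ replaces the removed collate_batch
dataloader = torch.utils.data.DataLoader(dataset, collate_fn=collate_fn)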
def __init__(
    self,
    dataset: Dataset,
    device: torch.device,
    num_labels: int = 2,
    averaging: AnyStr = 'binary',
    pad_token_id: int = None,
    mlm: bool = False,
    multi_gpu: bool = False,
    sequence_modeling: bool = False,
    ensemble_edu: bool = False,
    ensemble_sent: bool = False,
):
    self.dataset = dataset
    if isinstance(dataset, Subset):
        self.all_labels = list(dataset.dataset.getLabels(dataset.indices))
    else:
        self.all_labels = dataset.getLabels()

    if sequence_modeling:
        collator = collate_sequence_batch_transformer
    else:
        collator = collate_batch_transformer

    if mlm:
        collate_fn = DataCollatorForLanguageModeling(dataset.tokenizer)
    elif pad_token_id is None:
        collate_fn = partial(collator, dataset.tokenizer.pad_token_id)
    else:
        collate_fn = partial(collator, pad_token_id)

    self.dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn)
    self.device = device
    self.averaging = averaging
    self.num_labels = num_labels
    self.mlm = mlm
    self.pad_token_id = pad_token_id
    self.multi_gpu = multi_gpu
    self.sequence_modeling = sequence_modeling
    self.ensemble_edu = ensemble_edu
    self.ensemble_sent = ensemble_sent
def test_runner():
    """Test that runner executes"""
    train_df = pd.read_csv("data/train.csv")
    valid_df = pd.read_csv("data/valid.csv")

    teacher_config = AutoConfig.from_pretrained(
        "bert-base-uncased", output_hidden_states=True, output_logits=True
    )
    teacher = BertForMaskedLM.from_pretrained("bert-base-uncased", config=teacher_config)

    student_config = AutoConfig.from_pretrained(
        "distilbert-base-uncased",
        output_hidden_states=True,
        output_logits=True,
    )
    student = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased", config=student_config)

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    train_dataset = LanguageModelingDataset(train_df["text"], tokenizer)
    valid_dataset = LanguageModelingDataset(valid_df["text"], tokenizer)

    collate_fn = DataCollatorForLanguageModeling(tokenizer)
    train_dataloader = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=2)
    valid_dataloader = DataLoader(valid_dataset, collate_fn=collate_fn, batch_size=2)
    loaders = {"train": train_dataloader, "valid": valid_dataloader}

    callbacks = {
        "masked_lm_loss": MaskedLanguageModelCallback(),
        "mse_loss": MSELossCallback(),
        "cosine_loss": CosineLossCallback(),
        "kl_div_loss": KLDivLossCallback(),
        "loss": MetricAggregationCallback(
            prefix="loss",
            mode="weighted_sum",
            metrics={
                "cosine_loss": 1.0,
                "masked_lm_loss": 1.0,
                "kl_div_loss": 1.0,
                "mse_loss": 1.0,
            },
        ),
        "optimizer": dl.OptimizerCallback(),
        "perplexity": PerplexityMetricCallbackDistillation(),
    }

    model = torch.nn.ModuleDict({"teacher": teacher, "student": student})
    runner = DistilMLMRunner()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
    runner.train(
        model=model,
        optimizer=optimizer,
        loaders=loaders,
        verbose=True,
        check=True,
        callbacks=callbacks,
    )
    assert True
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    '''
    Example invocation:
    --output_dir=output
    --model_type=gpt2
    --model_name_or_path=gpt2
    --do_train
    --train_data_file=/wiki.train.raw
    --do_eval
    --eval_data_file=/wiki.test.raw
    '''
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )
    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it "
            "from another script, save it, and load it from here, using --tokenizer_name"
        )

    # Download the pretrained weights.
    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    # Existing embeddings are reused; newly added tokens get freshly initialized embeddings.
    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        # Our input block size will be the max possible for the model
        data_args.block_size = tokenizer.max_len
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None

    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
        )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
def get_data_collator():
    return DataCollatorForLanguageModeling(tokenizer=tokenizer)
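A hedged usage sketch for the helper above, assuming `tokenizer` is the same object already in scope and a recent transformers release where the collator is callable on a list of feature dicts:

collator = get_data_collator()
encodings = tokenizer(["a short example", "another slightly longer example"], truncation=True)
examples = [{"input_ids": ids} for ids in encodings["input_ids"]]
batch = collator(examples)
# batch["input_ids"] has ~15% of tokens masked; batch["labels"] is -100 outside the masked positions.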
from distiller import Distiller

# Getting the dataset
df = pd.read_csv("./data/SST-2/train.tsv", encoding='utf-8', sep='\t')
# len(df) is 67,349; since we are working on CPU we only take 3000 rows
train_df = df.iloc[:3000]

# Getting the teacher's tokenizer and preparing data
teacher_model_name = "bert-base-uncased"
student_model_name = "distilbert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(teacher_model_name)
dataset = LanguageModelingDataset(train_df["sentence"], teacher_model_name, sort=False)
collate_fn = DataCollatorForLanguageModeling(tokenizer)
dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=32)

# Getting the teacher and student models
teacher = BertForMaskedLM.from_pretrained(teacher_model_name)
student = DistilBertForMaskedLM.from_pretrained(student_model_name)

# Parameters needed for training
params = {
    "n_epoch": 3,
    "temperature": 2.0,
    "alpha_ce": 0.5,
    "alpha_mlm": 2.0,
    "alpha_cos": 1.0,
    "alpha_mse": 1.0,
    "gradient_accumulation_steps": 50,