def test_sop(self): tokenizer = BertTokenizer(self.vocab_file) features = [{ "input_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]), "token_type_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]), "sentence_order_label": i, } for i in range(2)] data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="tf") batch = data_collator(features) self.assertEqual(batch["input_ids"].shape.as_list(), [2, 5]) self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 5]) self.assertEqual(batch["labels"].shape.as_list(), [2, 5]) self.assertEqual(batch["sentence_order_label"].shape.as_list(), [2]) data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="tf") batch = data_collator(features) self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8]) self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 8]) self.assertEqual(batch["labels"].shape.as_list(), [2, 8]) self.assertEqual(batch["sentence_order_label"].shape.as_list(), [2])
def test_sop(self): tokenizer = BertTokenizer(self.vocab_file) features = [{ "input_ids": torch.tensor([0, 1, 2, 3, 4]), "token_type_ids": torch.tensor([0, 1, 2, 3, 4]), "sentence_order_label": i, } for i in range(2)] data_collator = DataCollatorForLanguageModeling(tokenizer) batch = data_collator(features) self.assertEqual(batch["input_ids"].shape, torch.Size((2, 5))) self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 5))) self.assertEqual(batch["labels"].shape, torch.Size((2, 5))) self.assertEqual(batch["sentence_order_label"].shape, torch.Size( (2, ))) data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8) batch = data_collator(features) self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8))) self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8))) self.assertEqual(batch["labels"].shape, torch.Size((2, 8))) self.assertEqual(batch["sentence_order_label"].shape, torch.Size( (2, )))
def test_nsp(self):
    tokenizer = BertTokenizer(self.vocab_file)
    features = [
        {
            "input_ids": [0, 1, 2, 3, 4],
            "token_type_ids": [0, 1, 2, 3, 4],
            "next_sentence_label": i,
        }
        for i in range(2)
    ]
    data_collator = DataCollatorForLanguageModeling(tokenizer, return_tensors="np")
    batch = data_collator(features)

    self.assertEqual(batch["input_ids"].shape, (2, 5))
    self.assertEqual(batch["token_type_ids"].shape, (2, 5))
    self.assertEqual(batch["labels"].shape, (2, 5))
    self.assertEqual(batch["next_sentence_label"].shape, (2,))

    data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8, return_tensors="np")
    batch = data_collator(features)

    self.assertEqual(batch["input_ids"].shape, (2, 8))
    self.assertEqual(batch["token_type_ids"].shape, (2, 8))
    self.assertEqual(batch["labels"].shape, (2, 8))
    self.assertEqual(batch["next_sentence_label"].shape, (2,))
def __init__(self, file_path, sets, bucket_size, max_timestep=0, drop=False,
             acoustic_config=None, semantic_config=None, tokenizer=None):
    super().__init__(file_path, sets, bucket_size, max_timestep, drop)
    self.acoustic_config = acoustic_config
    self.semantic_config = semantic_config
    self.tokenizer = tokenizer
    self.mlm_collater = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    self.sample_step = 0

    X_a = self.table['file_path'].tolist()
    X_lens = self.table['length'].tolist()
    X_t = self.table['align_path'].tolist()

    # Use bucketing to allow different batch sizes at run time
    self.X_a, self.X_t = [], []
    batch_x_a, batch_len, batch_x_t = [], [], []

    for x_a, x_len, x_t in zip(X_a, X_lens, X_t):
        batch_x_a.append(x_a)
        batch_len.append(x_len)
        batch_x_t.append(x_t)

        # Fill in batch_x until the batch is full
        if len(batch_x_a) == bucket_size:
            # Halve the batch size if a sequence is too long
            if (bucket_size >= 2) and (max(batch_len) > HALF_BATCHSIZE_TIME) and self.sample_step == 0:
                self.X_a.append(batch_x_a[:bucket_size // 2])
                self.X_a.append(batch_x_a[bucket_size // 2:])
                self.X_t.append(batch_x_t[:bucket_size // 2])
                self.X_t.append(batch_x_t[bucket_size // 2:])
            else:
                self.X_a.append(batch_x_a)
                self.X_t.append(batch_x_t)
            batch_x_a, batch_len, batch_x_t = [], [], []

    # Gather the last batch
    if len(batch_x_a) > 1:
        self.X_a.append(batch_x_a)
        self.X_t.append(batch_x_t)

    assert len(self.X_a) == len(self.X_t)
def test_data_collator_for_language_modeling(self):
    tokenizer = BertTokenizer(self.vocab_file)
    no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
    pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]

    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    batch = data_collator(no_pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

    batch = data_collator(pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

    tokenizer._pad_token = None
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    with self.assertRaises(ValueError):
        # Expect error due to padding token missing
        data_collator(pad_features)

    set_seed(42)  # For reproducibility
    tokenizer = BertTokenizer(self.vocab_file)
    data_collator = DataCollatorForLanguageModeling(tokenizer)

    batch = data_collator(no_pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

    masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
    self.assertTrue(torch.any(masked_tokens))
    self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))

    batch = data_collator(pad_features)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

    masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
    self.assertTrue(torch.any(masked_tokens))
    self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
def __init__(self, config: Munch):
    self.config = config

    # ---- Neuron ----
    self.neuron = Neuron(self.config)

    # ---- Model ----
    self.model = BertMLMSynapse(self.config)

    # ---- Optimizer ----
    self.optimizer = torch.optim.SGD(self.model.parameters(),
                                     lr=self.config.session.learning_rate,
                                     momentum=self.config.session.momentum)
    self.scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, 50, 300)

    # ---- Dataset ----
    # Dataset: 74 million sentences pulled from books.
    self.dataset = load_dataset('bookcorpus')['train']

    # The collator accepts a list [ dict{'input_ids': ...} ] where the internal dict
    # is produced by the tokenizer.
    self.data_collator = DataCollatorForLanguageModeling(
        tokenizer=bittensor.__tokenizer__(), mlm=True, mlm_probability=0.15)

    # ---- Logging ----
    self.tensorboard = SummaryWriter(log_dir=self.config.session.full_path)
    if self.config.session.record_log:
        logger.add(
            self.config.session.full_path + "/{}_{}.log".format(
                self.config.session.name, self.config.session.trial_uid),
            format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}")
def create_trainer(tokenizer, model):
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path="data/processed/recipes_train.txt",
        block_size=256,
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    training_args = TrainingArguments(
        output_dir="./artifacts",
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_gpu_train_batch_size=128,
        save_steps=100_000_000,
        save_total_limit=2,
        fp16=True,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=True,
    )
    return trainer
def __init__(self, config: Munch = None, **kwargs):
    if config is None:
        config = Miner.default_config()
    bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
    Miner.check_config(config)
    self.config = config

    # ---- Model ----
    self.model = BertMLMSynapse(self.config)

    # ---- Optimizer ----
    self.optimizer = torch.optim.SGD(self.model.parameters(),
                                     lr=self.config.miner.learning_rate,
                                     momentum=self.config.miner.momentum)
    self.scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, 50, 300)

    # ---- Model load/save tools ----
    self.model_toolbox = ModelToolbox(BertMLMSynapse, torch.optim.SGD)

    # ---- Dataset ----
    # Dataset: AG News.
    self.dataset = load_dataset('ag_news')['train']

    # The collator accepts a list [ dict{'input_ids': ...} ] where the internal dict
    # is produced by the tokenizer.
    self.data_collator = DataCollatorForLanguageModeling(
        tokenizer=bittensor.__tokenizer__(), mlm=True, mlm_probability=0.15)

    super(Miner, self).__init__(self.config, **kwargs)
def collate_fn(self) -> Callable:
    if self.cfg.wwm:
        return DataCollatorForWholeWordMask(
            self.tokenizer, mlm_probability=self.cfg.mlm_probability)
    else:
        return DataCollatorForLanguageModeling(
            self.tokenizer, mlm_probability=self.cfg.mlm_probability)
def execute(self, environment_path: str) -> None:
    dataset = LineByLineTextDataset(tokenizer=self.tokenizer,
                                    file_path=self.file_path,
                                    block_size=self.block_size)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=self.tokenizer, mlm=True, mlm_probability=self.mlm_probability)
    training_args = TrainingArguments(
        output_dir=os.path.join(environment_path, "temp"),
        overwrite_output_dir=True,
        num_train_epochs=self.epochs,
        per_gpu_train_batch_size=self.batch_size_per_gpu,
        save_steps=self.save_steps,
        save_total_limit=self.save_total_limit,
    )
    trainer = Trainer(
        model=self.model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=True,
    )
    trainer.train()
    trainer.save_model(os.path.join(environment_path, "model"))
    self.tokenizer.save_pretrained(os.path.join(environment_path, "tokenizer"))
def __init__(self, data_dir: Path, tokenizer: PreTrainedTokenizer, dataset: Dataset, local_rank=-1):
    assert data_dir, "data_dir input needed"
    self.model_dir = f"{data_dir}/results"
    self.dataset = dataset
    self.config = RobertaConfig(
        vocab_size=52_000,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )
    self.training_args = TrainingArguments(
        run_name=data_dir.name,
        local_rank=local_rank,
        learning_rate=0.00005,  # default 0.00005
        output_dir=f"{self.model_dir}",
        overwrite_output_dir=False,
        num_train_epochs=1,
        per_device_train_batch_size=48,  # Nvidia K80 99%
        seed=42,
        save_steps=10_000,
        save_total_limit=1,
    )
    self.data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
def get_dataloaders(model, tokenizer, batch_size, train_path, eval_path):
    block_size = 1024
    train_dataset = TextDataset(tokenizer=tokenizer, file_path=train_path, block_size=block_size)
    test_dataset = TextDataset(tokenizer=tokenizer, file_path=eval_path, block_size=block_size)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=False,
                                                    mlm_probability=0.15)
    trainloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        collate_fn=data_collator,
        drop_last=False,
        num_workers=0,
    )
    testloader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        collate_fn=data_collator,
        drop_last=False,
        num_workers=0,
    )
    return trainloader, testloader
def train(self, num_epochs=500, batch_size=32, save_total_limit=2, save_steps=500, logging_steps=100):
    training_args = TrainingArguments(
        output_dir=f"./saved/{self.model_name}",
        overwrite_output_dir=True,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        save_steps=save_steps,
        save_total_limit=save_total_limit,
        prediction_loss_only=True,
        logging_steps=logging_steps)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=self.tokenizer, mlm=True, mlm_probability=MASK_PROB)
    card_trainset = CardDataset(self.train_path, self.tokenizer, to_tensor=True)
    trainer = Trainer(model=self.model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=card_trainset)
    self.tokenizer.save_pretrained(f"./saved/{self.model_name}")
    trainer.train()
    trainer.save_model(f"./saved/{self.model_name}")
def train(model: PreTrainedModel, tokenizer: PreTrainedTokenizer,
          train_batch_size: int, eval_batch_size: int, mlm=False):
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=mlm,
                                                    mlm_probability=0.15)
    args = TrainingArguments()
    args.per_gpu_train_batch_size = train_batch_size
    args.per_gpu_eval_batch_size = eval_batch_size
    args.per_device_train_batch_size = train_batch_size

    train_dataset, eval_dataset = dataset.get_dataset(tokenizer)
    trainer = Trainer(
        model=model,
        args=args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )
    trainer.train(model_path=None)
    trainer.save_model()
def __init__(self, opts):
    # Command line arguments
    self.opts = opts

    # Load model and tokenizer
    config = AutoConfig.from_pretrained(opts.ckpt_file)
    self.tokenizer = AutoTokenizer.from_pretrained(opts.ckpt_file)
    self.model = AutoModelWithLMHead.from_pretrained(opts.ckpt_file, config=config)
    self.model.resize_token_embeddings(len(self.tokenizer))

    # Load training arguments
    if opts.mode == 'train' or opts.mode == 'eval':
        self.training_args = TrainingArguments
        self.training_args.device = 'cpu'
        self.training_args.n_gpu = 0
        self.training_args.logging_dir = opts.output_dir
        self.training_args.output_dir = opts.output_dir
        self.training_args.num_train_epochs = opts.num_epochs
        self.training_args.learning_rate = opts.learning_rate
        self.training_args.train_batch_size = opts.batch_size
        self.training_args.eval_batch_size = opts.batch_size

    # Load dataset
    if opts.mode == 'train' or opts.mode == 'eval':
        self.dataset = LineByLineTextDataset(  # TextDataset
            tokenizer=self.tokenizer,
            file_path=opts.text_file,
            block_size=self.tokenizer.max_len)
        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm=False)
def get_trainer(model_file, train_file, valid_file, model_output_dir):
    model = AutoModelForMaskedLM.from_pretrained(model_file)
    tokenizer = AutoTokenizer.from_pretrained(model_file, do_lower_case=True, use_fast=False)
    lm_datasets = create_dataset(tokenizer, train_file, valid_file)
    print(tokenizer.decode(lm_datasets["train"][1]["input_ids"]))

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
    training_args = TrainingArguments(
        output_dir=model_output_dir,        # output directory
        overwrite_output_dir=True,
        num_train_epochs=3,                 # total # of training epochs
        per_device_train_batch_size=16,     # batch size per device during training
        per_device_eval_batch_size=64,      # batch size for evaluation
        warmup_steps=500,                   # number of warmup steps for learning rate scheduler
        weight_decay=0.01,                  # strength of weight decay
        logging_dir=path + 'logs',          # directory for storing logs
    )
    trainer = Trainer(
        model=model,                        # the instantiated 🤗 Transformers model to be trained
        args=training_args,                 # training arguments, defined above
        train_dataset=lm_datasets["train"],
        eval_dataset=lm_datasets["validation"],  # evaluation dataset
        data_collator=data_collator,
    )
    return trainer
def train(self, epochs, lr=5e-5, batch_size=1):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=self.tokenizer, mlm=False)
    training_args = TrainingArguments(
        output_dir="./output-lm",
        no_cuda=(self.device != torch.device("cuda")),
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        # save_steps=10,
        save_total_limit=1,
        learning_rate=lr,
        evaluation_strategy="epoch",
        logging_steps=float("inf"),
        prediction_loss_only=False,
    )
    self.trainer = Trainer(
        model=self.model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=self.train_dataset,
        eval_dataset=self.val_dataset,
    )
    self.trainer.train()
    return self.trainer
def __init__(self, evaluate: bool = False):
    self.model_args = ModelArguments(
        model_name_or_path=None,
        model_type='bert',
        tokenizer_name='models/danbert-small/vocab.json',
        config_name="models/danbert-small/config.json")
    self.data_args = DataTrainingArguments(
        train_data_file="handler/datadir/da-train.txt",
        eval_data_file="handler/datadir/da-eval.txt",
        labels="handler/datadir/labels.txt",
        mlm=True,
        line_by_line=True)
    self.training_args = TrainingArguments(
        output_dir="models/danbert-small",
        num_train_epochs=3,
        per_gpu_eval_batch_size=8,
        save_steps=750,
        seed=42,
        learning_rate=1e-4,
        save_total_limit=2)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if self.training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        self.training_args.local_rank,
        self.training_args.device,
        self.training_args.n_gpu,
        bool(self.training_args.local_rank != -1),
        self.training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", self.training_args)

    set_seed(self.training_args.seed)

    config = CONFIG_MAPPING[self.model_args.model_type]()
    logger.warning("You are instantiating a new config instance from scratch.")

    tokenizer = AutoTokenizer.from_pretrained('models/danbert-small')
    model = AutoModelWithLMHead.from_config(config)
    # model.resize_token_embeddings(len(tokenizer))

    if self.data_args.block_size <= 0:
        self.data_args.block_size = 512  # Our input block size will be the max possible for the model
    else:
        self.data_args.block_size = min(self.data_args.block_size, 512)

    train_dataset = self.get_dataset(
        self.data_args, tokenizer=tokenizer,
        local_rank=self.training_args.local_rank, evaluate=True)
    eval_dataset = self.get_dataset(
        self.data_args, tokenizer=tokenizer,
        local_rank=self.training_args.local_rank, evaluate=True)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=self.data_args.mlm,
        mlm_probability=self.data_args.mlm_probability)

    trainer = Trainer(
        model=model,
        args=self.training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        # eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )
    trainer.train(model_path=None)
    trainer.save_model()
def main(): print("PREPROCESSING DATA") preprocess() print("LOADING TOKENIZER") tokenizer = get_tokenizer() data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) print("LOADING MODEL", cfg('model')) model = get_model(tokenizer) print("LOADING DATA") if cfg('encoding') == 'LBL': train_dataset = LBLDataset(tokenizer=tokenizer, file_path=filename('train')) elif cfg('encoding') == 'blocked': train_dataset = BlockedDataset(tokenizer=tokenizer, file_path=filename('train')) elif cfg('encoding') == 'text': train_dataset = TextDataset(tokenizer=tokenizer, file_path=filename('train'), block_size=cfg('max_block')) elif cfg('encoding').startswith('inter'): if cfg('encoding').endswith('LBL'): loader = LBLDataset elif cfg('encoding').endswith('blocked'): loader = BlockedDataset d1 = loader(tokenizer=tokenizer, file_path=filename('train')) d2 = loader(tokenizer=tokenizer, file_path=filename('dirty')) train_dataset = CombinedDataset(d1, d2) else: raise ValueError("Unkown encoding") trainer = get_trainer(train_dataset, data_collator, model) def validator(x, y): global BEST_metric model.save_pretrained(session) metric, pred = validate(model, tokenizer, x, y) if np.mean(metric) > BEST_metric: print("NEW BEST (saving)") BEST_metric = np.mean(metric) # save predicitions and model save(session + "metric.txt", str(metric) + "\n") save(session + "pred.txt", str(pred) + "\n\n") return metric, pred trainer.validator = validator trainer.val_dataset = get_validation_data() # saving configuration print("SAVING...") session = get_session_path() print(session) save(session + "conf.txt", repr(cfg())) print("STARTING TRAINING...") trainer.train()
def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path=None):
    # Train from scratch if model_path is None
    def _dataset(file_path):
        return LineByLineTextDataset(tokenizer=tokenizer,
                                     file_path=file_path,
                                     block_size=tokenizer.max_len)

    val_dataset = _dataset(args.val_datapath)
    if eval_only:
        print("Assign validation dataset")
        train_dataset = val_dataset
    else:
        logger.info(
            f'Loading and tokenizing training data is usually slow: {args.train_datapath}')
        train_dataset = ConcatDataset([
            _dataset(f) for f in glob.glob('./Preprocessed_Data/splited_train/*')
        ])

    print("Creating data collator with mlm")
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)

    print("Start Trainer")
    trainer = Trainer(
        model=model,
        args=args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        prediction_loss_only=True,
    )

    if not eval_only:
        trainer.train(model_path=model_path)  # None -> train from scratch
        trainer.save_model(args.output_dir)   # save model to the output_dir

    # Evaluation
    results = {}
    logger.info("*** Evaluate ***")
    eval_loss = trainer.evaluate()
    eval_loss = eval_loss['eval_loss']
    perplexity = math.exp(eval_loss)
    results["perplexity"] = perplexity
    results["bpc"] = eval_loss / math.log(2)

    output_eval_file = os.path.join(args.output_dir, "eval_results_mlm.txt")
    with open(output_eval_file, "a") as writer:
        writer.write("***** Eval results *****")
        logger.info("***** Eval results *****")
        for key, value in results.items():
            logger.info(f"  {key} = {value}")
            writer.write(f"{key} = {value}\n")
def run(self):
    result_folder = luigi.configuration.get_config().get('GlobalConfig', 'result_folder')

    model = GPT2LMHeadModel.from_pretrained("distilgpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

    train_dataset = TextDataset(tokenizer, self.input()['train'].path, block_size=self.block_size)
    test_dataset = TextDataset(tokenizer, self.input()['test'].path, block_size=self.block_size)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    training_args = TrainingArguments(
        do_eval=self.do_eval,
        do_train=self.do_train,
        eval_steps=self.eval_steps,
        evaluate_during_training=self.evaluate_during_training,
        gradient_accumulation_steps=self.gradient_accumulation_steps,
        logging_dir='./logs',
        logging_steps=self.logging_steps,
        learning_rate=self.learning_rate,
        max_grad_norm=self.max_grad_norm,
        num_train_epochs=self.num_train_epochs,
        output_dir=result_folder,
        overwrite_output_dir=True,
        per_device_train_batch_size=self.per_device_train_batch_size,
        per_device_eval_batch_size=self.per_device_eval_batch_size,
        save_steps=self.save_steps,
        seed=self.seed,
        warmup_steps=self.warmup_steps,
        weight_decay=self.weight_decay,
    )
    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=test_dataset)
    trainer.train()
    trainer.save_model()
    tokenizer.save_pretrained(result_folder)

    wandb_disabled = os.environ.get('WANDB_DISABLED', False)
    if wandb_disabled:
        run_name = time.strftime('%Y%m%d-%H%M%S')
    else:
        wandb.run.save()
        wandb.join()
        run_name = wandb.run.name

    with open(self.output()['run_name'].path, 'w') as f:
        f.write(run_name)
def test_lm_tokenizer_without_padding(self):
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    # ^ causal lm

    dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
    examples = [dataset[i] for i in range(len(dataset))]
    with self.assertRaises(ValueError):
        # Expect error due to padding token missing on gpt2:
        data_collator.collate_batch(examples)

    dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
    examples = [dataset[i] for i in range(len(dataset))]
    batch = data_collator.collate_batch(examples)
    self.assertIsInstance(batch, dict)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
    self.assertEqual(batch["labels"].shape, torch.Size((2, 512)))
def __init__(self, config, mode, *args, **params):
    self.max_len = config.getint("train", "max_len")
    self.mode = mode
    self.tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")
    self.mlm_prob = config.getfloat("train", "mlm_prob")
    self.data_collator = DataCollatorForLanguageModeling(
        tokenizer=self.tokenizer, mlm_probability=self.mlm_prob)
def get_data_collator(tokenizer: BertTokenizer,
                      mlm: bool = True,
                      mlm_prob: float = 0.15) -> DataCollatorForLanguageModeling:
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=mlm,
                                                    mlm_probability=mlm_prob)
    return data_collator
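# A minimal usage sketch (not taken from any of the snippets above) showing what a
# collator configured like the one returned by get_data_collator produces. It assumes
# a recent transformers release where DataCollatorForLanguageModeling is callable
# directly on a list of tokenized features; the checkpoint name "bert-base-uncased"
# and the example sentences are placeholders.
from transformers import BertTokenizer, DataCollatorForLanguageModeling

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Features are dicts (or BatchEncodings) containing at least "input_ids".
features = [tokenizer("a short example sentence"),
            tokenizer("another, slightly longer example sentence")]
batch = collator(features)

# batch["input_ids"] has roughly 15% of positions corrupted (mostly replaced by [MASK]);
# batch["labels"] keeps the original ids at those positions and -100 everywhere else,
# so the loss is only computed on the masked tokens.
print(batch["input_ids"].shape, batch["labels"].shape)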
def _collate(self, batch_examples: List) -> Dict:
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=self.dataset_reader.encoder.tokenizer,
        mlm=True,
        mlm_probability=0.15)
    return data_collator(batch_examples)
def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path):
    if tokenizer.model_max_length > 1e8:
        val_dataset = TextDataset(tokenizer=tokenizer,
                                  file_path=args.val_datapath,
                                  block_size=512)
        logger.info(
            f'[WARNING] tokenizer.model_max_length > 10^8: {tokenizer.model_max_length} setting the value as 512 instead.')
    else:
        val_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path=args.val_datapath,
            block_size=tokenizer.model_max_length)  # The `max_len` attribute has been deprecated

    if eval_only:
        train_dataset = val_dataset
    else:
        logger.info(
            f'Loading and tokenizing training data is usually slow: {args.train_datapath}')
        if tokenizer.model_max_length > 1e8:
            train_dataset = TextDataset(tokenizer=tokenizer,
                                        file_path=args.train_datapath,
                                        block_size=512)
            logger.info(
                f'[WARNING] tokenizer.model_max_length > 10^8: {tokenizer.model_max_length} setting the value as 512 instead.')
        else:
            train_dataset = TextDataset(tokenizer=tokenizer,
                                        file_path=args.train_datapath,
                                        block_size=tokenizer.model_max_length)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    trainer = Trainer(
        model=model,
        args=args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        prediction_loss_only=True,
    )

    eval_loss = trainer.evaluate()
    # pdb.set_trace()
    eval_loss = eval_loss['eval_loss']
    logger.info(f'Initial eval bpc: {eval_loss / math.log(2)}')

    if not eval_only:
        trainer.train(model_path=model_path)
        trainer.save_model()

        eval_loss = trainer.evaluate()
        eval_loss = eval_loss['eval_loss']
        logger.info(f'Eval bpc after pretraining: {eval_loss / math.log(2)}')
def pretrain_and_evaluate(training_args, dataset_args, model, tokenizer, eval_only):
    """
    Adapted from https://colab.research.google.com/drive/1-JIJlao4dI-Ilww_NnTc0rxtp-ymgDgM?usp=sharing#scrollTo=N8J-TLhBuaOf

    :param training_args: HF training args object
    :param dataset_args: object storing dataset config, requires train_datapath and val_datapath to be defined
    :param model: transformers.PreTrainedModel
    :param tokenizer: PreTrainedTokenizerBase
    :param eval_only: boolean, True only performs evaluation
    :return:
    """
    val_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=dataset_args.val_datapath,
        block_size=tokenizer.model_max_length,
    )
    if eval_only:
        train_dataset = val_dataset
    else:
        logging.info(
            f"Loading and tokenizing training data is usually slow: {dataset_args.train_datapath}")
        train_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path=dataset_args.train_datapath,
            block_size=tokenizer.model_max_length,
        )

    # https://github.com/huggingface/transformers/blob/master/src/transformers/data/data_collator.py
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    # https://huggingface.co/transformers/_modules/transformers/trainer.html
    trainer = Trainer_(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    metrics = trainer.evaluate()
    # eval_loss = metrics["eval_loss"]
    # logging.info(f"Initial eval bpc: {eval_loss / math.log(2)}")
    logging.info(f"Initial metrics: {metrics}")

    if not eval_only:
        # To continue training an existing model, change model_path here
        # (same path as the from_checkpoint argument from the builder).
        trainer.train(model_path=None)
        trainer.save_model()

        metrics = trainer.evaluate()
        eval_loss = metrics["eval_loss"]
        logging.info(f"Eval bpc after pretraining: {eval_loss / math.log(2)}")
def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path):
    val_dataset, train_dataset = None, None
    for val_datapath in args.val_datapath:
        if not val_dataset:
            val_dataset = CustomIterableDataset(
                tokenizer=tokenizer,
                file_path=val_datapath,
                block_size=model.config.max_position_embeddings)
        else:
            val_dataset += CustomIterableDataset(
                tokenizer=tokenizer,
                file_path=val_datapath,
                block_size=model.config.max_position_embeddings)

    if eval_only:
        train_dataset = val_dataset
    else:
        logger.info(
            f'Loading and tokenizing training data is usually slow: {args.train_datapath}')
        for train_datapath in args.train_datapath:
            if not train_dataset:
                train_dataset = CustomIterableDataset(
                    tokenizer=tokenizer,
                    file_path=train_datapath,
                    block_size=model.config.max_position_embeddings,
                )
            else:
                train_dataset += CustomIterableDataset(
                    tokenizer=tokenizer,
                    file_path=train_datapath,
                    block_size=model.config.max_position_embeddings,
                )
        # train_dataset = val_dataset

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    trainer = CustomTrainer(
        model=model,
        args=args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        prediction_loss_only=True,
    )

    eval_loss = trainer.evaluate()
    eval_loss = eval_loss['eval_loss']
    logger.info(f'Initial eval bpc: {eval_loss / math.log(2)}')

    if not eval_only:
        trainer.train(model_path=model_path)
        trainer.save_model()

        eval_loss = trainer.evaluate()
        eval_loss = eval_loss['eval_loss']
        logger.info(f'Eval bpc after pretraining: {eval_loss / math.log(2)}')
def test_lm_tokenizer_with_padding(self):
    tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
    data_collator = DataCollatorForLanguageModeling(tokenizer)
    # ^ masked lm

    dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
    examples = [dataset[i] for i in range(len(dataset))]
    batch = data_collator.collate_batch(examples)
    self.assertIsInstance(batch, dict)
    self.assertEqual(batch["input_ids"].shape, torch.Size((31, 107)))
    self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((31, 107)))

    dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
    examples = [dataset[i] for i in range(len(dataset))]
    batch = data_collator.collate_batch(examples)
    self.assertIsInstance(batch, dict)
    self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
    self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((2, 512)))
def finetune_model(transformers_model_name: str, corpus_file_path: str):
    config = AutoConfig.from_pretrained(
        transformers_model_name,
        force_download=False,
        cache_dir='../data/download_transformer_models')
    tokenizer = AutoTokenizer.from_pretrained(
        transformers_model_name,
        force_download=False,
        cache_dir='../data/download_transformer_models')
    # tokenizer = RobertaTokenizerFast.from_pretrained(transformers_model_name, force_download=False, cache_dir='../data/download_transformer_models')
    model = AutoModelForMaskedLM.from_pretrained(
        transformers_model_name,
        config=config,
        force_download=False,
        cache_dir='../data/download_transformer_models')

    dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                    file_path=corpus_file_path,
                                    block_size=512)
    train_set, valid_set = train_test_split(dataset, test_size=0.25, random_state=32)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    training_args = TrainingArguments(
        output_dir="../data/finetune_transformer_models/",
        logging_dir='../saved/finetune_logging',
        logging_steps=500,
        overwrite_output_dir=True,
        weight_decay=0.01,
        adam_epsilon=1e-6,
        learning_rate=2e-5,
        num_train_epochs=5,
        per_gpu_train_batch_size=4,
        per_gpu_eval_batch_size=32,
        max_grad_norm=5.0,
        save_steps=1000,
        save_total_limit=2,
        gradient_accumulation_steps=32,
        evaluate_during_training=True,
        do_train=True,
        do_eval=True,
        do_predict=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_set,
        eval_dataset=valid_set,
    )
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    trainer.train()