import json

from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert


def main(args):
    # Build an ALBERT model from a JSON config, load TF checkpoint weights, and save it.
    with open(args.config) as fp:
        data = json.loads(fp.read())
    config = AlbertConfig(**data)
    model = AlbertForMaskedLM(config)
    model = load_tf_weights_in_albert(model, config, args.checkpoint)
    model.save_pretrained(args.output)
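A minimal sketch of the command-line wiring that would drive this entry point; the flag names are assumptions inferred from the attributes `main` reads (`args.config`, `args.checkpoint`, `args.output`), not taken from the original script:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert a TF ALBERT checkpoint to PyTorch")
    # Hypothetical flags matching the attributes main() reads.
    parser.add_argument("--config", required=True, help="Path to the ALBERT config JSON")
    parser.add_argument("--checkpoint", required=True, help="Path to the TF checkpoint")
    parser.add_argument("--output", required=True, help="Directory for the converted model")
    main(parser.parse_args())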
def create_and_check_for_masked_lm(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = AlbertForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = AlbertForMaskedLM(config)
    load_tf_weights_in_albert(model, config, tf_checkpoint_path)
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
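A hypothetical invocation showing the expected argument order; the paths here are placeholders, not from the original:

convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="albert_base/model.ckpt-best",
    albert_config_file="albert_base/albert_config.json",
    pytorch_dump_path="albert_base/pytorch_model.bin",
)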
import json

from transformers import (AlbertConfig, AlbertForMaskedLM, AlbertTokenizer,
                          load_tf_weights_in_albert)


def main(args):
    with open(args.config) as fp:
        data = json.loads(fp.read())
    config = AlbertConfig(**data)
    model = AlbertForMaskedLM(config)
    model = load_tf_weights_in_albert(model, config, args.checkpoint)
    model.save_pretrained(args.output)
    # Export the SentencePiece tokenizer alongside the converted weights.
    tokenizer = AlbertTokenizer.from_pretrained(args.spiece, keep_accents=True)
    tokenizer.save_pretrained(args.output)
def create_and_check_albert_for_masked_lm(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = AlbertForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    loss, prediction_scores = model(
        input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
    )
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size]
    )
    self.check_loss_output(result)
def albert_convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert

    # Initialise PyTorch model
    config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = AlbertForMaskedLM(config)
    # Load weights from tf checkpoint
    load_tf_weights_in_albert(model, config, tf_checkpoint_path)
    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
def get_model(args):
    if args.model_size == 'debug':
        num_hidden_layers = 1
        embedding_size = 8
        hidden_size = 16
        intermediate_size = 32
        num_attention_heads = 2
        args.gen_ratio = 2
    elif args.model_size == 'tiny':
        num_hidden_layers = 4
        embedding_size = 128
        hidden_size = 336
        intermediate_size = 1344
        num_attention_heads = 12
    elif args.model_size == 'small':
        num_hidden_layers = 12
        embedding_size = 128
        hidden_size = 256
        intermediate_size = 1024
        num_attention_heads = 4
    elif args.model_size == 'base':
        num_hidden_layers = 12
        embedding_size = 768
        hidden_size = 768
        intermediate_size = 3072
        num_attention_heads = 12
    else:
        raise Exception('Which model? debug, tiny, small, base')
    config = AlbertConfig(
        max_position_embeddings=args.seq_length,
        vocab_size=args.vocab_size,
        num_hidden_layers=num_hidden_layers,
        embedding_size=embedding_size,
        hidden_size=hidden_size // args.gen_ratio,
        intermediate_size=intermediate_size // args.gen_ratio,
        num_attention_heads=num_attention_heads // args.gen_ratio,
    )
    model = AlbertForMaskedLM(config)
    return model
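A quick usage sketch with a hypothetical argument namespace; the field values here (seq_length, vocab_size, gen_ratio) are illustrative, not from the original:

from argparse import Namespace

args = Namespace(model_size='small', seq_length=128, vocab_size=30522, gen_ratio=1)
model = get_model(args)  # AlbertForMaskedLM with hidden_size=256 and 4 attention heads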
def main(tokenizer_path, dataset_path, save_path='alectra-small', max_steps=1e6,
         accumulate_grad_batches=1, gpus=None, num_tpu_cores=None, distributed_backend=None,
         val_check_interval=0.25, val_check_percent=0.25, generator_type='albert',
         num_hidden_groups=1, d_loss_weight=50, mlm_prob=0.15, learning_rate=5e-4,
         warmup_steps=10000, batch_size=128, num_workers=2, tie_embedding_proj=False,
         tie_encoder=True, shuffle=True, lr_schedule='linear', resume_from_checkpoint=None,
         use_polyaxon=False):
    # init tokenizer. only need it for the special chars.
    tokenizer = BertWordPieceTokenizer(tokenizer_path)

    # init generator.
    if generator_type == 'albert':
        generator_config = AlbertConfig(
            vocab_size=tokenizer._tokenizer.get_vocab_size(),
            hidden_size=256,
            embedding_size=128,
            num_hidden_layers=3,
            num_attention_heads=1,
            num_hidden_groups=num_hidden_groups,
            intermediate_size=1024,
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            classifier_dropout_prob=0.1,
            max_position_embeddings=128)
        generator = AlbertForMaskedLM(generator_config)
    elif generator_type == 'bert':
        generator_config = BertConfig(
            vocab_size=tokenizer._tokenizer.get_vocab_size(),
            hidden_size=128,
            num_hidden_layers=3,
            num_attention_heads=1,
            intermediate_size=256,
            max_position_embeddings=128)
        generator = BertForMaskedLM(generator_config)
        tie_weights(generator.cls.predictions.decoder, generator.bert.embeddings.word_embeddings)
    else:
        raise Exception(f"invalid generator type: {generator_type}")

    # init discriminator.
    discriminator_config = AlbertConfig(
        vocab_size=tokenizer._tokenizer.get_vocab_size(),
        hidden_size=256,
        embedding_size=128,
        num_hidden_layers=12,
        num_attention_heads=4,
        num_hidden_groups=num_hidden_groups,
        intermediate_size=1024,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        classifier_dropout_prob=0.1,
        max_position_embeddings=128)
    discriminator = AlbertForTokenClassification(discriminator_config)

    # tie the embedding weights.
    tie_weights(discriminator.base_model.embeddings.word_embeddings,
                generator.base_model.embeddings.word_embeddings)
    tie_weights(discriminator.base_model.embeddings.position_embeddings,
                generator.base_model.embeddings.position_embeddings)
    tie_weights(discriminator.base_model.embeddings.token_type_embeddings,
                generator.base_model.embeddings.token_type_embeddings)

    if generator_type == 'albert' and tie_encoder:
        print('tying albert encoder layers')
        discriminator.albert.encoder.albert_layer_groups = generator.albert.encoder.albert_layer_groups
    if generator_type == 'albert' and tie_embedding_proj:
        print('tying embedding projection layers')
        discriminator.albert.encoder.embedding_hidden_mapping_in = generator.albert.encoder.embedding_hidden_mapping_in

    # init training module.
    training_config = DiscLMTrainingModuleConfig(max_steps,
                                                 d_loss_weight=d_loss_weight,
                                                 save_path=save_path,
                                                 weight_decay=0.01,
                                                 learning_rate=learning_rate,
                                                 epsilon=1e-6,
                                                 lr_schedule=lr_schedule,
                                                 warmup_steps=warmup_steps)
    if use_polyaxon:
        checkpoint_fn = polyaxon_checkpoint_fn
    else:
        checkpoint_fn = None
    lightning_module = DiscLMTrainingModule(generator, discriminator, training_config,
                                            checkpoint_fn=checkpoint_fn)

    # init trainer.
    trainer = Trainer(accumulate_grad_batches=accumulate_grad_batches,
                      gpus=gpus,
                      num_tpu_cores=num_tpu_cores,
                      distributed_backend=distributed_backend,
                      max_steps=max_steps,
                      resume_from_checkpoint=resume_from_checkpoint,
                      val_check_percent=val_check_percent,
                      val_check_interval=val_check_interval)

    # init dataloaders.
    train_loader, val_loader, _ = get_dataloaders(tokenizer, dataset_path, trainer,
                                                  mlm_prob, batch_size, num_workers, shuffle)

    # train.
    trainer.fit(lightning_module, train_loader, val_loader)

    # save the model.
    output_path = os.path.join(save_path, 'discriminator', 'final')
    os.makedirs(output_path, exist_ok=True)
    lightning_module.discriminator.base_model.save_pretrained(output_path)
    if checkpoint_fn:
        checkpoint_fn(lightning_module)
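`tie_weights` is called above but not defined in the snippet. A minimal sketch of one common implementation, assuming parameter sharing by pointing both modules at the same weight tensor:

def tie_weights(module_a, module_b):
    # Share the underlying Parameter so updates to one module are seen by the other.
    module_a.weight = module_b.weight

Note that tying the word embeddings works here even though the generator and discriminator have different hidden sizes, because both configs use the same embedding_size (128).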
def init_process(local_rank, backend, config, albert_config, logger):
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    torch.cuda.set_device(local_rank)

    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if local_rank != 0:
        logger.setLevel(logging.WARNING)
    if local_rank == 0:
        writer = SummaryWriter()
        if not os.path.exists("save"):
            os.mkdir("save")
        save_path = "save/model_{}.pt".format(re.sub(r"\s+", "_", time.asctime()))

    reader = Reader(config)
    start = time.time()
    logger.info("Loading data...")
    reader.load_data()
    end = time.time()
    logger.info("Loaded. {} secs".format(end - start))

    model = AlbertForMaskedLM(albert_config).cuda()
    optimizer = Adam(model.parameters(), lr=config.lr)
    lr = config.lr  # track the current learning rate for the manual schedule below
    if config.save_path is not None:
        load(model, optimizer, config.save_path, local_rank)

    train.global_step = 0
    train.max_iter = len(list(reader.make_batch("train")))
    validate.max_iter = len(list(reader.make_batch("dev")))
    min_loss = 1e+10
    early_stop_count = config.early_stop_count
    # logger.info("Validate...")
    # loss = validate(model, reader, config, local_rank)
    # logger.info("loss: {:.4f}".format(loss))

    for epoch in range(config.max_epochs):
        logger.info("Train...")
        start = time.time()
        if local_rank == 0:
            train_test(model, reader, optimizer, config, local_rank, writer)
        else:
            train_test(model, reader, optimizer, config, local_rank)
        exit(0)  # stop after a single training pass
        end = time.time()
        logger.info("epoch: {}, {:.4f} secs".format(epoch + 1, end - start))

        logger.info("Validate...")
        loss = validate(model, reader, config, local_rank)
        logger.info("loss: {:.4f}".format(loss))
        if local_rank == 0:
            writer.add_scalar("Val/loss", loss, epoch + 1)

        if loss < min_loss:
            # save model
            if local_rank == 0:
                save(model, optimizer, save_path)
                logger.info("Saved to {}.".format(os.path.abspath(save_path)))
            min_loss = loss
            early_stop_count = config.early_stop_count
        else:
            # early stopping
            if early_stop_count == 0:
                if epoch < config.min_epochs:
                    early_stop_count += 1
                    logger.info("Too early to stop training.")
                    logger.info("early stop count: {}".format(early_stop_count))
                else:
                    logger.info("Early stopped.")
                    break
            elif early_stop_count == 2:
                lr = lr / 2
                logger.info("learning rate schedule: {}".format(lr))
                for param in optimizer.param_groups:
                    param["lr"] = lr
                early_stop_count -= 1
                logger.info("early stop count: {}".format(early_stop_count))
    logger.info("Training finished.")
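`save` and `load` are called above but not shown. A minimal sketch of what they might look like, assuming plain state-dict checkpointing; the checkpoint key names are assumptions:

def save(model, optimizer, save_path):
    # Persist model and optimizer state in a single checkpoint file.
    torch.save({
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
    }, save_path)


def load(model, optimizer, save_path, local_rank):
    # Map the checkpoint onto this process's GPU before restoring.
    checkpoint = torch.load(save_path, map_location="cuda:{}".format(local_rank))
    model.load_state_dict(checkpoint["model"])
    optimizer.load_state_dict(checkpoint["optimizer"])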
def __init__(self, args, random_init='none'):
    assert random_init in ['none', 'all', 'embedding']
    super().__init__()
    self._model_device = 'cpu'

    model_name = args.model_name
    vocab_name = model_name
    if args.model_dir is not None:
        # load bert model from file
        model_name = str(args.model_dir) + "/"
        vocab_name = model_name
        logger.info("loading BERT model from {}".format(model_name))

    # Load pre-trained model tokenizer (vocabulary)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    if torch.cuda.device_count() > 1:
        torch.cuda.manual_seed_all(args.seed)

    config = AutoConfig.from_pretrained(model_name)
    if isinstance(config, AlbertConfig):
        self.model_type = 'albert'
        self.tokenizer = AlbertTokenizer.from_pretrained(vocab_name)
        self.mlm_model = AlbertForMaskedLM.from_pretrained(model_name)
        if random_init == 'all':
            logger.info('Random initialize model...')
            self.mlm_model = AlbertForMaskedLM(self.mlm_model.config)
        self.base_model = self.mlm_model.albert
    elif isinstance(config, RobertaConfig):
        self.model_type = 'roberta'
        self.tokenizer = RobertaTokenizer.from_pretrained(vocab_name)
        self.mlm_model = RobertaForMaskedLM.from_pretrained(model_name)
        if random_init == 'all':
            logger.info('Random initialize model...')
            self.mlm_model = RobertaForMaskedLM(self.mlm_model.config)
        self.base_model = self.mlm_model.roberta
    elif isinstance(config, BertConfig):
        self.model_type = 'bert'
        self.tokenizer = BertTokenizer.from_pretrained(vocab_name)
        self.mlm_model = BertForMaskedLM.from_pretrained(model_name)
        if random_init == 'all':
            logger.info('Random initialize model...')
            self.mlm_model = BertForMaskedLM(self.mlm_model.config)
        self.base_model = self.mlm_model.bert
    else:
        raise ValueError('Model %s not supported yet!' % model_name)

    self.mlm_model.eval()
    if random_init == 'embedding':
        logger.info('Random initialize embedding layer...')
        self.mlm_model._init_weights(self.base_model.embeddings.word_embeddings)

    # original vocab
    self.map_indices = None
    self.vocab = list(self.tokenizer.get_vocab().keys())
    logger.info('Vocab size: %d' % len(self.vocab))
    self._init_inverse_vocab()

    self.MASK = self.tokenizer.mask_token
    self.EOS = self.tokenizer.eos_token
    self.CLS = self.tokenizer.cls_token
    self.SEP = self.tokenizer.sep_token
    self.UNK = self.tokenizer.unk_token
    # print(self.MASK, self.EOS, self.CLS, self.SEP, self.UNK)
    self.pad_id = self.inverse_vocab[self.tokenizer.pad_token]
    self.unk_index = self.inverse_vocab[self.tokenizer.unk_token]

    # used to output top-k predictions
    self.k = args.k
def main(tokenizer_path, dataset_path, save_path='albert-small', max_steps=1.5e6,
         accumulate_grad_batches=1, gpus=None, num_tpu_cores=None, distributed_backend=None,
         val_check_interval=0.25, mlm_prob=0.15, learning_rate=5e-4, warmup_steps=10000,
         batch_size=128, num_workers=2, shuffle=True, use_polyaxon=False):
    # init tokenizer. only need it for the special chars.
    tokenizer = BertWordPieceTokenizer(tokenizer_path)

    # init transformer.
    albert_config = AlbertConfig(
        vocab_size=tokenizer._tokenizer.get_vocab_size(),
        hidden_size=256,
        embedding_size=128,
        num_hidden_layers=12,
        num_attention_heads=4,
        intermediate_size=1024,
        max_position_embeddings=128)
    albert = AlbertForMaskedLM(albert_config)

    # init training module.
    training_config = LMTrainingModuleConfig(max_steps,
                                             mlm=True,
                                             save_path=save_path,
                                             weight_decay=0.01,
                                             learning_rate=learning_rate,
                                             epsilon=1e-6,
                                             warmup_steps=warmup_steps)
    if use_polyaxon:
        checkpoint_fn = polyaxon_checkpoint_fn
    else:
        checkpoint_fn = None
    lightning_module = LMTrainingModule(albert, training_config, checkpoint_fn=checkpoint_fn)

    # init trainer.
    trainer = Trainer(accumulate_grad_batches=accumulate_grad_batches,
                      gpus=gpus,
                      num_tpu_cores=num_tpu_cores,
                      distributed_backend=distributed_backend,
                      max_steps=max_steps,
                      val_check_interval=val_check_interval)

    # init dataloaders.
    train_loader, val_loader, _ = get_dataloaders(tokenizer, dataset_path, trainer,
                                                  mlm_prob, batch_size, num_workers, shuffle)

    # train.
    trainer.fit(lightning_module, train_loader, val_loader)

    # save the model.
    output_path = os.path.join(save_path, 'final')
    os.makedirs(output_path, exist_ok=True)
    lightning_module.model.base_model.save_pretrained(output_path)
    if checkpoint_fn:
        checkpoint_fn(lightning_module)
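Since `base_model.save_pretrained` writes the bare encoder (config plus weights), the result can be reloaded with the standard `from_pretrained` API; `'albert-small/final'` is simply the default `output_path` from above:

from transformers import AlbertModel

# Reload the encoder saved by the training run above.
albert = AlbertModel.from_pretrained('albert-small/final')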