def main(args):
    with open(args.config) as fp:
        data = json.loads(fp.read())
    config = AlbertConfig(**data)
    model = AlbertForMaskedLM(config)
    model: AlbertForMaskedLM = load_tf_weights_in_albert(model, config, args.checkpoint)
    model.save_pretrained(args.output)
def create_and_check_for_masked_lm(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = AlbertForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
Example #3
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file,
                                     pytorch_dump_path):
    # Initialise PyTorch model
    config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = AlbertForMaskedLM(config)
    load_tf_weights_in_albert(model, config, tf_checkpoint_path)
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
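# The file written above holds a plain state_dict, so reloading it means
# rebuilding the model from the same config first. A minimal sketch; the helper
# name and paths are illustrative, not from the original example:
import torch
from transformers import AlbertConfig, AlbertForMaskedLM

def reload_converted_model(albert_config_file, pytorch_dump_path):
    config = AlbertConfig.from_json_file(albert_config_file)
    model = AlbertForMaskedLM(config)
    model.load_state_dict(torch.load(pytorch_dump_path))
    model.eval()
    return model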
def main(args):
    with open(args.config) as fp:
        data = json.loads(fp.read())
    config = AlbertConfig(**data)
    model = AlbertForMaskedLM(config)
    model: AlbertForMaskedLM = load_tf_weights_in_albert(
        model, config, args.checkpoint)
    model.save_pretrained(args.output)

    tokenizer = AlbertTokenizer.from_pretrained(args.spiece, keep_accents=True)
    tokenizer.save_pretrained(args.output)
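# A hypothetical command-line wrapper for this main(); the flag names mirror the
# attribute accesses above (args.config, args.checkpoint, args.spiece, args.output)
# and are not taken from the original project:
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Convert a TF ALBERT checkpoint to PyTorch")
    parser.add_argument("--config", required=True, help="path to the ALBERT config JSON")
    parser.add_argument("--checkpoint", required=True, help="path to the TensorFlow checkpoint")
    parser.add_argument("--spiece", required=True, help="SentencePiece model file for the tokenizer")
    parser.add_argument("--output", required=True, help="output directory for save_pretrained")
    main(parser.parse_args())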
def create_and_check_albert_for_masked_lm(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    model = AlbertForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    loss, prediction_scores = model(
        input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels
    )
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()),
        [self.batch_size, self.seq_length, self.vocab_size])
    self.check_loss_output(result)
Example #6
File: covert.py  Project: SunYanCN/BAND
def albert_convert_tf_checkpoint_to_pytorch(tf_checkpoint_path,
                                            albert_config_file,
                                            pytorch_dump_path):
    from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert
    # Initialise PyTorch model
    config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = AlbertForMaskedLM(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_albert(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
Example #7
def get_model(args):
    if args.model_size == 'debug':
        num_hidden_layers = 1
        embedding_size = 8
        hidden_size = 16
        intermediate_size = 32
        num_attention_heads = 2
        args.gen_ratio = 2

    elif args.model_size == 'tiny':
        num_hidden_layers = 4
        embedding_size = 128
        hidden_size = 336
        intermediate_size = 1344
        num_attention_heads = 12
    elif args.model_size == 'small':
        num_hidden_layers = 12
        embedding_size = 128
        hidden_size = 256
        intermediate_size = 1024
        num_attention_heads = 4
    elif args.model_size == 'base':
        num_hidden_layers = 12
        embedding_size = 768
        hidden_size = 768
        intermediate_size = 3072
        num_attention_heads = 12

    else:
        raise Exception("Unknown model size: expected 'debug', 'tiny', 'small', or 'base'")

    config = AlbertConfig(
        max_position_embeddings=args.seq_length,
        vocab_size=args.vocab_size,
        num_hidden_layers=num_hidden_layers,
        embedding_size=embedding_size,
        hidden_size=hidden_size // args.gen_ratio,
        intermediate_size=intermediate_size // args.gen_ratio,
        num_attention_heads=num_attention_heads // args.gen_ratio,
    )

    model = AlbertForMaskedLM(config)
    return model
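# get_model() only reads model_size, seq_length, vocab_size and gen_ratio from
# args, so a bare Namespace is enough to exercise it. Illustrative values only,
# not taken from the original script:
from argparse import Namespace

demo_args = Namespace(model_size="small", seq_length=128, vocab_size=30000, gen_ratio=1)
generator = get_model(demo_args)
print(sum(p.numel() for p in generator.parameters()), "generator parameters")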
def main(tokenizer_path,
         dataset_path,
         save_path='alectra-small',
         max_steps=1e6,
         accumulate_grad_batches=1,
         gpus=None,
         num_tpu_cores=None,
         distributed_backend=None,
         val_check_interval=0.25,
         val_check_percent=0.25,
         generator_type='albert',
         num_hidden_groups=1,
         d_loss_weight=50,
         mlm_prob=0.15,
         learning_rate=5e-4,
         warmup_steps=10000,
         batch_size=128,
         num_workers=2,
         tie_embedding_proj=False,
         tie_encoder=True,
         shuffle=True,
         lr_schedule='linear',
         resume_from_checkpoint=None,
         use_polyaxon=False):
    # init tokenizer.  only need it for the special chars.
    tokenizer = BertWordPieceTokenizer(tokenizer_path)

    # init generator.
    if generator_type == 'albert':
        generator_config = AlbertConfig(
            vocab_size=tokenizer._tokenizer.get_vocab_size(),
            hidden_size=256,
            embedding_size=128,
            num_hidden_layers=3,
            num_attention_heads=1,
            num_hidden_groups=num_hidden_groups,
            intermediate_size=1024,
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            classifier_dropout_prob=0.1,
            max_position_embeddings=128)
        generator = AlbertForMaskedLM(generator_config)
    elif generator_type == 'bert':
        generator_config = BertConfig(
            vocab_size=tokenizer._tokenizer.get_vocab_size(),
            hidden_size=128,
            num_hidden_layers=3,
            num_attention_heads=1,
            intermediate_size=256,
            max_position_embeddings=128)
        generator = BertForMaskedLM(generator_config)
        tie_weights(generator.cls.predictions.decoder,
                    generator.bert.embeddings.word_embeddings)
    else:
        raise Exception(f"invalid generator type: {generator_type}")

    # init discriminator.
    discriminator_config = AlbertConfig(
        vocab_size=tokenizer._tokenizer.get_vocab_size(),
        hidden_size=256,
        embedding_size=128,
        num_hidden_layers=12,
        num_attention_heads=4,
        num_hidden_groups=num_hidden_groups,
        intermediate_size=1024,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        classifier_dropout_prob=0.1,
        max_position_embeddings=128)
    discriminator = AlbertForTokenClassification(discriminator_config)

    # tie the embedding weights.
    tie_weights(discriminator.base_model.embeddings.word_embeddings,
                generator.base_model.embeddings.word_embeddings)
    tie_weights(discriminator.base_model.embeddings.position_embeddings,
                generator.base_model.embeddings.position_embeddings)
    tie_weights(discriminator.base_model.embeddings.token_type_embeddings,
                generator.base_model.embeddings.token_type_embeddings)

    if generator_type == 'albert' and tie_encoder:
        print('tying albert encoder layers')
        discriminator.albert.encoder.albert_layer_groups = generator.albert.encoder.albert_layer_groups
    if generator_type == 'albert' and tie_embedding_proj:
        print('tying embedding projection layers')
        discriminator.albert.encoder.embedding_hidden_mapping_in = generator.albert.encoder.embedding_hidden_mapping_in

    # init training module.
    training_config = DiscLMTrainingModuleConfig(max_steps,
                                                 d_loss_weight=d_loss_weight,
                                                 save_path=save_path,
                                                 weight_decay=0.01,
                                                 learning_rate=learning_rate,
                                                 epsilon=1e-6,
                                                 lr_schedule=lr_schedule,
                                                 warmup_steps=warmup_steps)
    if use_polyaxon:
        checkpoint_fn = polyaxon_checkpoint_fn
    else:
        checkpoint_fn = None
    lightning_module = DiscLMTrainingModule(generator,
                                            discriminator,
                                            training_config,
                                            checkpoint_fn=checkpoint_fn)

    # init trainer.
    trainer = Trainer(accumulate_grad_batches=accumulate_grad_batches,
                      gpus=gpus,
                      num_tpu_cores=num_tpu_cores,
                      distributed_backend=distributed_backend,
                      max_steps=max_steps,
                      resume_from_checkpoint=resume_from_checkpoint,
                      val_check_percent=val_check_percent,
                      val_check_interval=val_check_interval)

    # init dataloaders.
    train_loader, val_loader, _ = get_dataloaders(tokenizer, dataset_path,
                                                  trainer, mlm_prob,
                                                  batch_size, num_workers,
                                                  shuffle)

    # train.
    trainer.fit(lightning_module, train_loader, val_loader)

    # save the model.
    output_path = os.path.join(save_path, 'discriminator', 'final')
    os.makedirs(output_path, exist_ok=True)
    lightning_module.discriminator.base_model.save_pretrained(output_path)
    if checkpoint_fn:
        checkpoint_fn(lightning_module)
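# The tie_weights() helper used above is not shown in this example. A minimal
# sketch, assuming it simply shares the underlying weight tensor between two
# modules (which is what parameter tying amounts to for nn.Embedding / nn.Linear):
import torch.nn as nn

def tie_weights(module_a: nn.Module, module_b: nn.Module):
    # After this assignment both modules hold the same Parameter object,
    # so gradients and weight updates are shared.
    module_a.weight = module_b.weight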
Example #9
def init_process(local_rank, backend, config, albert_config, logger):
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    torch.cuda.set_device(local_rank)

    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if local_rank != 0:
        logger.setLevel(logging.WARNING)
    
    if local_rank == 0:
        writer = SummaryWriter()
        if not os.path.exists("save"):
            os.mkdir("save")
        save_path = "save/model_{}.pt".format(re.sub("\s+", "_", time.asctime()))

    reader = Reader(config)
    start = time.time()
    logger.info("Loading data...")
    reader.load_data()
    end = time.time()
    logger.info("Loaded. {} secs".format(end-start))

    model = AlbertForMaskedLM(albert_config).cuda()
    optimizer = Adam(model.parameters(), lr=config.lr)

    if config.save_path is not None:
        load(model, optimizer, config.save_path, local_rank)

    train.global_step = 0
    train.max_iter = len(list(reader.make_batch("train")))
    validate.max_iter = len(list(reader.make_batch("dev")))

    min_loss = 1e+10
    early_stop_count = config.early_stop_count

    # logger.info("Validate...")
    # loss = validate(model, reader, config, local_rank)
    # logger.info("loss: {:.4f}".format(loss))

    for epoch in range(config.max_epochs):
        logger.info("Train...")
        start = time.time()

        if local_rank == 0:
            train_test(model, reader, optimizer, config, local_rank, writer)
        else:
            train_test(model, reader, optimizer, config, local_rank)
        
        exit(0)

        end = time.time()
        logger.info("epoch: {}, {:.4f} secs".format(epoch+1, end-start))

        logger.info("Validate...")
        loss = validate(model, reader, config, local_rank)
        logger.info("loss: {:.4f}".format(loss))
        
        if local_rank == 0:
            writer.add_scalar("Val/loss", loss, epoch+1)

        if loss < min_loss:  # save model
            if local_rank == 0:
                save(model, optimizer, save_path)
                logger.info("Saved to {}.".format(os.path.abspath(save_path)))
            
            min_loss = loss
            early_stop_count = config.early_stop_count
        else:  # early stopping
            if early_stop_count == 0:
                if epoch < config.min_epochs:
                    early_stop_count += 1
                    logger.info("Too early to stop training.")
                    logger.info("early stop count: {}".format(early_stop_count))
                else:
                    logger.info("Early stopped.")
                    break
            elif early_stop_count == 2:
                lr = optimizer.param_groups[0]["lr"] / 2  # halve the current learning rate
                logger.info("learning rate schedule: {}".format(lr))
                for param in optimizer.param_groups:
                    param["lr"] = lr
            early_stop_count -= 1
            logger.info("early stop count: {}".format(early_stop_count))
    logger.info("Training finished.")
Example #10
    def __init__(self, args, random_init='none'):
        assert (random_init in ['none', 'all', 'embedding'])

        super().__init__()

        self._model_device = 'cpu'

        model_name = args.model_name
        vocab_name = model_name

        if args.model_dir is not None:
            # load bert model from file
            model_name = str(args.model_dir) + "/"
            vocab_name = model_name
            logger.info("loading BERT model from {}".format(model_name))

        # Load pre-trained model tokenizer (vocabulary)
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)
        if torch.cuda.device_count() > 1:
            torch.cuda.manual_seed_all(args.seed)

        config = AutoConfig.from_pretrained(model_name)
        if isinstance(config, AlbertConfig):
            self.model_type = 'albert'
            self.tokenizer = AlbertTokenizer.from_pretrained(vocab_name)
            self.mlm_model = AlbertForMaskedLM.from_pretrained(model_name)
            if random_init == 'all':
                logger.info('Random initialize model...')
                self.mlm_model = AlbertForMaskedLM(self.mlm_model.config)
            self.base_model = self.mlm_model.albert
        elif isinstance(config, RobertaConfig):
            self.model_type = 'roberta'
            self.tokenizer = RobertaTokenizer.from_pretrained(vocab_name)
            self.mlm_model = RobertaForMaskedLM.from_pretrained(model_name)
            if random_init == 'all':
                logger.info('Random initialize model...')
                self.mlm_model = RobertaForMaskedLM(self.mlm_model.config)
            self.base_model = self.mlm_model.roberta
        elif isinstance(config, BertConfig):
            self.model_type = 'bert'
            self.tokenizer = BertTokenizer.from_pretrained(vocab_name)
            self.mlm_model = BertForMaskedLM.from_pretrained(model_name)
            if random_init == 'all':
                logger.info('Random initialize model...')
                self.mlm_model = BertForMaskedLM(self.mlm_model.config)
            self.base_model = self.mlm_model.bert
        else:
            raise ValueError('Model %s not supported yet!' % (model_name))

        self.mlm_model.eval()

        if random_init == 'embedding':
            logger.info('Random initialize embedding layer...')
            self.mlm_model._init_weights(
                self.base_model.embeddings.word_embeddings)

        # original vocab
        self.map_indices = None
        self.vocab = list(self.tokenizer.get_vocab().keys())
        logger.info('Vocab size: %d' % len(self.vocab))
        self._init_inverse_vocab()

        self.MASK = self.tokenizer.mask_token
        self.EOS = self.tokenizer.eos_token
        self.CLS = self.tokenizer.cls_token
        self.SEP = self.tokenizer.sep_token
        self.UNK = self.tokenizer.unk_token
        # print(self.MASK, self.EOS, self.CLS, self.SEP, self.UNK)

        self.pad_id = self.inverse_vocab[self.tokenizer.pad_token]
        self.unk_index = self.inverse_vocab[self.tokenizer.unk_token]

        # used to output top-k predictions
        self.k = args.k
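# A minimal sketch (not part of the original class) of how the tokenizer and
# mlm_model set up above could be used for top-k mask filling, assuming a recent
# transformers version where ForMaskedLM models return an output with .logits:
import torch

def topk_mask_predictions(tokenizer, mlm_model, text, k=5):
    # `text` must contain the tokenizer's mask token (e.g. "[MASK]" or "<mask>").
    inputs = tokenizer(text, return_tensors="pt")
    mask_positions = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
    with torch.no_grad():
        logits = mlm_model(**inputs).logits
    return [tokenizer.convert_ids_to_tokens(logits[0, pos].topk(k).indices.tolist())
            for pos in mask_positions]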
def main(tokenizer_path,
         dataset_path,
         save_path='albert-small',
         max_steps=1.5e6,
         accumulate_grad_batches=1,
         gpus=None,
         num_tpu_cores=None,
         distributed_backend=None,
         val_check_interval=0.25,
         mlm_prob=0.15,
         learning_rate=5e-4,
         warmup_steps=10000,
         batch_size=128,
         num_workers=2,
         shuffle=True,
         use_polyaxon=False):
    # init tokenizer.  only need it for the special chars.
    tokenizer = BertWordPieceTokenizer(tokenizer_path)

    # init transformer.
    albert_config = AlbertConfig(
        vocab_size=tokenizer._tokenizer.get_vocab_size(),
        hidden_size=256,
        embedding_size=128,
        num_hidden_layers=12,
        num_attention_heads=4,
        intermediate_size=1024,
        max_position_embeddings=128)
    albert = AlbertForMaskedLM(albert_config)

    # init training module.
    training_config = LMTrainingModuleConfig(max_steps,
                                             mlm=True,
                                             save_path=save_path,
                                             weight_decay=0.01,
                                             learning_rate=learning_rate,
                                             epsilon=1e-6,
                                             warmup_steps=warmup_steps)
    if use_polyaxon:
        checkpoint_fn = polyaxon_checkpoint_fn
    else:
        checkpoint_fn = None
    lightning_module = LMTrainingModule(albert,
                                        training_config,
                                        checkpoint_fn=checkpoint_fn)

    # init trainer.
    trainer = Trainer(accumulate_grad_batches=accumulate_grad_batches,
                      gpus=gpus,
                      num_tpu_cores=num_tpu_cores,
                      distributed_backend=distributed_backend,
                      max_steps=max_steps,
                      val_check_interval=val_check_interval)

    # init dataloaders.
    train_loader, val_loader, _ = get_dataloaders(tokenizer, dataset_path,
                                                  trainer, mlm_prob,
                                                  batch_size, num_workers,
                                                  shuffle)

    # train.
    trainer.fit(lightning_module, train_loader, val_loader)

    # save the model.
    output_path = os.path.join(save_path, 'final')
    os.makedirs(output_path, exist_ok=True)
    lightning_module.model.base_model.save_pretrained(output_path)
    if checkpoint_fn:
        checkpoint_fn(lightning_module)
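# One possible invocation of this entry point; every value below is a placeholder,
# not taken from the original project:
if __name__ == "__main__":
    main(tokenizer_path="vocab.txt",
         dataset_path="data/pretraining_corpus",
         save_path="albert-small",
         gpus=1,
         batch_size=32)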