def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file,
                                     pytorch_dump_path,
                                     discriminator_or_generator):
    # Initialise PyTorch model
    config = ElectraConfig.from_json_file(config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))

    if discriminator_or_generator == "discriminator":
        model = ElectraForPreTraining(config)
    elif discriminator_or_generator == "generator":
        model = ElectraForMaskedLM(config)
    else:
        raise ValueError(
            "The discriminator_or_generator argument should be either 'discriminator' or 'generator'"
        )

    # Load weights from tf checkpoint
    load_tf_weights_in_electra(
        model,
        config,
        tf_checkpoint_path,
        discriminator_or_generator=discriminator_or_generator)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
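A minimal call sketch for the converter above; the paths are hypothetical placeholders (the real conversion script typically collects them via argparse).

# Hypothetical local paths; adjust to where the TF ELECTRA checkpoint lives.
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="electra_small/model.ckpt",
    config_file="electra_small/config.json",
    pytorch_dump_path="electra_small/pytorch_model.bin",
    discriminator_or_generator="discriminator",
)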
Example #2
class ElectraForLanguageModelingModel(PreTrainedModel):
    def __init__(self, config, **kwargs):
        super(ElectraForLanguageModelingModel, self).__init__(config, **kwargs)
        if "generator_config" in kwargs:
            generator_config = kwargs["generator_config"]
        else:
            generator_config = config
        self.generator_model = ElectraForMaskedLM(generator_config)
        if "discriminator_config" in kwargs:
            discriminator_config = kwargs["discriminator_config"]
        else:
            discriminator_config = config
        self.discriminator_model = ElectraForPreTraining(discriminator_config)
        self.vocab_size = generator_config.vocab_size
        if kwargs.get("tie_generator_and_discriminator_embeddings", True):
            self.tie_generator_and_discriminator_embeddings()

    def tie_generator_and_discriminator_embeddings(self):
        self.discriminator_model.set_input_embeddings(self.generator_model.get_input_embeddings())

    def forward(self, inputs, masked_lm_labels, attention_mask=None, token_type_ids=None):
        d_inputs = inputs.clone()

        # run masked LM.
        g_out = self.generator_model(
            inputs, masked_lm_labels=masked_lm_labels, attention_mask=attention_mask, token_type_ids=token_type_ids
        )

        # get samples from masked LM.
        sample_probs = torch.softmax(g_out[1], dim=-1, dtype=torch.float32)
        sample_probs = sample_probs.view(-1, self.vocab_size)

        sampled_tokens = torch.multinomial(sample_probs, 1).view(-1)
        sampled_tokens = sampled_tokens.view(d_inputs.shape[0], -1)

        # labels have a -100 value to mask out loss from unchanged tokens.
        mask = masked_lm_labels.ne(-100)

        # replace the masked out tokens of the input with the generator predictions.
        d_inputs[mask] = sampled_tokens[mask]

        # turn mask into new target labels.  1 (True) for corrupted, 0 otherwise.
        # if the prediction was correct, mark it as uncorrupted.
        correct_preds = sampled_tokens == masked_lm_labels
        d_labels = mask.long()
        d_labels[correct_preds] = 0

        # run token classification, predict whether each token was corrupted.
        d_out = self.discriminator_model(
            d_inputs, labels=d_labels, attention_mask=attention_mask, token_type_ids=token_type_ids
        )

        g_loss = g_out[0]
        d_loss = d_out[0]
        g_scores = g_out[1]
        d_scores = d_out[1]
        return g_loss, d_loss, g_scores, d_scores, d_labels
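The forward pass above returns the generator and discriminator losses separately. In the ELECTRA objective they are combined into a single loss, with the discriminator term weighted heavily (the paper uses a weight of 50). A minimal training-step sketch, assuming `model`, `optimizer`, and a `batch` dict already exist:

# One training step (sketch); the names `model`, `optimizer`, `batch` are assumptions.
g_loss, d_loss, g_scores, d_scores, d_labels = model(
    batch["input_ids"],
    masked_lm_labels=batch["masked_lm_labels"],
    attention_mask=batch["attention_mask"],
)
loss = g_loss + 50.0 * d_loss  # lambda = 50, as in the ELECTRA paper
loss.backward()
optimizer.step()
optimizer.zero_grad()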
Example #3
 def __init__(self, config, **kwargs):
     super(ElectraForLanguageModelingModel, self).__init__(config, **kwargs)
     if "generator_config" in kwargs:
         generator_config = kwargs["generator_config"]
     else:
         generator_config = config
     self.generator_model = ElectraForMaskedLM(generator_config)
     if "discriminator_config" in kwargs:
         discriminator_config = kwargs["discriminator_config"]
     else:
         discriminator_config = config
     self.discriminator_model = ElectraForPreTraining(discriminator_config)
     self.vocab_size = generator_config.vocab_size
     if kwargs.get("tie_generator_and_discriminator_embeddings", True):
         self.tie_generator_and_discriminator_embeddings()
Example #4
 def __init__(self, config: ElectraConfig, embeddings):
     super().__init__()
     self.embed_layer = nn.Embedding(num_embeddings=config.vocab_size,
                                     embedding_dim=config.embedding_size,
                                     padding_idx=config.vocab_size - 1)
     self.embed_layer.weight = nn.Parameter(embeddings)
     self.discriminator = ElectraForPreTraining(config)
     self.sigmoid = nn.Sigmoid()
Example #5
def main(train_cfg='config/electra_pretrain.json',
         model_cfg='config/electra_small.json',
         data_file='../tbc/books_large_all.txt',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         log_dir='../exp/electra/pretrain/runs',
         save_dir='../exp/electra/pretrain',
         max_len=128,
         max_pred=20,
         mask_prob=0.15,
         quantize=False):

    check_dirs_exist([log_dir, save_dir])

    train_cfg = ElectraConfig.from_json_file(train_cfg)
    model_cfg = ElectraConfig.from_json_file(model_cfg)

    set_seeds(train_cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab,
                                           do_lower_case=True)
    tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))

    pipeline = [
        Preprocess4Pretrain(max_pred, mask_prob, list(tokenizer.vocab.keys()),
                            tokenizer.convert_tokens_to_ids, max_len)
    ]

    data_iter = SentPairDataLoader(data_file,
                                   train_cfg.batch_size,
                                   tokenize,
                                   max_len,
                                   pipeline=pipeline)

    # Get distilled-electra and quantized-distilled-electra
    generator = ElectraForMaskedLM.from_pretrained(
        'google/electra-small-generator')
    t_discriminator = ElectraForPreTraining.from_pretrained(
        'google/electra-base-discriminator')
    s_discriminator = QuantizedElectraForPreTraining(
        model_cfg) if quantize else ElectraForPreTraining
    s_discriminator = s_discriminator.from_pretrained(
        'google/electra-small-discriminator', config=model_cfg)  # model
    # config is used for model "QuantizedElectraForPreTraining"
    model = DistillElectraForPreTraining(generator, t_discriminator,
                                         s_discriminator, model_cfg)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

    base_trainer_args = (train_cfg, model_cfg, model, data_iter, None,
                         optimizer, save_dir, get_device())
    trainer = QuantizedDistillElectraTrainer(writer, *base_trainer_args)
    trainer.train(model_file, None, data_parallel)
    trainer._eval()
Example #6
 def create_and_check_electra_for_pretraining(
     self,
     config,
     input_ids,
     token_type_ids,
     input_mask,
     sequence_labels,
     token_labels,
     choice_labels,
     fake_token_labels,
 ):
     config.num_labels = self.num_labels
     model = ElectraForPreTraining(config=config)
     model.to(torch_device)
     model.eval()
     loss, logits = model(input_ids,
                          attention_mask=input_mask,
                          token_type_ids=token_type_ids,
                          labels=fake_token_labels)
     result = {
         "loss": loss,
         "logits": logits,
     }
     self.parent.assertListEqual(list(result["logits"].size()),
                                 [self.batch_size, self.seq_length])
     self.check_loss_output(result)
Example #7
 def __init__(self, config, **kwargs):
     super(ElectraForLanguageModelingModel, self).__init__(config, **kwargs)
     if "generator_config" in kwargs:
         generator_config = kwargs["generator_config"]
     else:
         generator_config = config
     self.generator_model = ElectraForMaskedLM(generator_config)
     if "discriminator_config" in kwargs:
         discriminator_config = kwargs["discriminator_config"]
     else:
         discriminator_config = config
     self.discriminator_model = ElectraForPreTraining(discriminator_config)
     self.vocab_size = config.vocab_size
Example #8
    def test_inference_no_head_absolute_embedding(self):
        model = ElectraForPreTraining.from_pretrained(
            "google/electra-small-discriminator")
        input_ids = torch.tensor(
            [[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]])
        output = model(input_ids)[0]
        expected_shape = torch.Size((1, 11))
        self.assertEqual(output.shape, expected_shape)
        expected_slice = torch.tensor([[
            -8.9253, -4.0305, -3.9306, -3.8774, -4.1873, -4.1280, 0.9429,
            -4.1672, 0.9281, 0.0410, -3.4823
        ]])

        self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4))
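The values checked above are raw logits from the per-token replaced-token-detection head. To interpret them, one can map them to probabilities or hard labels; a small sketch reusing `output` from the test above:

probs = torch.sigmoid(output)   # probability that each token was replaced
preds = (output > 0).long()     # hard 0/1 prediction (logit > 0  <=>  prob > 0.5)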
Example #9
def load_huggingface_weights_in_electra(model, config, pytorch_model_path):
    try:
        from transformers import ElectraForPreTraining
    except ImportError:
        raise ImportError(
            "cannot import transformers, please install transformers first")

    hf_model = ElectraForPreTraining.from_pretrained(pytorch_model_path)
    model.electra.embeddings.token_embeddings.weight = (
        hf_model.electra.embeddings.word_embeddings.weight)

    model.electra.embeddings.position_embeddings.weight = (
        hf_model.electra.embeddings.position_embeddings.weight)
    model.electra.embeddings.token_type_embeddings.weight = (
        hf_model.electra.embeddings.token_type_embeddings.weight)
    model.electra.embeddings.layer_norm.weight = hf_model.electra.embeddings.LayerNorm.weight
    model.electra.embeddings.layer_norm.bias = hf_model.electra.embeddings.LayerNorm.bias

    if config.embedding_size != config.hidden_size:
        model.electra.embeddings_project.weight = hf_model.electra.embeddings_project.weight

    for layer_idx in range(config.num_hidden_layers):
        layer = model.electra.encoder.layers[layer_idx]
        hf_layer = hf_model.electra.encoder.layer[layer_idx]

        layer.self.query.weight = hf_layer.attention.self.query.weight
        layer.self.query.bias = hf_layer.attention.self.query.bias

        layer.self.key.weight = hf_layer.attention.self.key.weight
        layer.self.key.bias = hf_layer.attention.self.key.bias

        layer.self.value.weight = hf_layer.attention.self.value.weight
        layer.self.value.bias = hf_layer.attention.self.value.bias

        layer.self.dense.weight = hf_layer.attention.output.dense.weight
        layer.self.dense.bias = hf_layer.attention.output.dense.bias

        layer.feed_forward.intermediate.weight = hf_layer.intermediate.dense.weight
        layer.feed_forward.intermediate.bias = hf_layer.intermediate.dense.bias

        layer.feed_forward.output.weight = hf_layer.output.dense.weight
        layer.feed_forward.output.bias = hf_layer.output.dense.bias

        layer.add_norm[0].layer_norm.weight = hf_layer.attention.output.LayerNorm.weight
        layer.add_norm[0].layer_norm.bias = hf_layer.attention.output.LayerNorm.bias
        layer.add_norm[1].layer_norm.weight = hf_layer.output.LayerNorm.weight
        layer.add_norm[1].layer_norm.bias = hf_layer.output.LayerNorm.bias
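After copying weights like this, a quick sanity check is to run both models on the same input and compare logits. The sketch below reuses the function's local names (`model`, `hf_model`, `config`), assumes the custom model accepts a plain `input_ids` tensor and returns logits first like the Hugging Face model, and assumes `torch` is imported in this module.

# Sanity check (sketch): the ported model should match the Hugging Face reference.
model.eval()
hf_model.eval()
input_ids = torch.randint(0, config.vocab_size, (1, 16))
with torch.no_grad():
    ported = model(input_ids)[0]       # custom model's call signature is an assumption
    reference = hf_model(input_ids)[0]
assert torch.allclose(ported, reference, atol=1e-4)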
Example #10
def detect_error_demo():
    tokenizer = ElectraTokenizer.from_pretrained(D_model_dir)
    discriminator = ElectraForPreTraining.from_pretrained(D_model_dir)

    sentence = '今天新情很好'  # deliberately misspelled: "新情" should be "心情" (mood)
    fake_tokens = tokenizer.tokenize(sentence)
    print(fake_tokens)
    fake_inputs = tokenizer.encode(sentence, return_tensors="pt")

    discriminator_outputs = discriminator(fake_inputs)
    predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2).squeeze(0)  # drop the batch dim so tolist() gives per-token scores

    print(list(zip(fake_tokens, predictions.tolist())))
    print("fixed " + '*' * 42)
    print(list(zip(fake_tokens, predictions.tolist()[1:-1])))
Example #11
    def __init__(self, d_model_dir=os.path.join(pwd_path,
                                               "../data/electra_models/chinese_electra_base_discriminator_pytorch/"),
                 g_model_dir=os.path.join(pwd_path,
                                          "../data/electra_models/chinese_electra_base_generator_pytorch/"),
                 ):
        super(ElectraCorrector, self).__init__()
        self.name = 'electra_corrector'
        t1 = time.time()
        self.g_model = pipeline("fill-mask",
                                model=g_model_dir,
                                tokenizer=g_model_dir
                                )
        self.d_model = ElectraForPreTraining.from_pretrained(d_model_dir)

        if self.g_model:
            self.mask = self.g_model.tokenizer.mask_token
            logger.debug('Loaded electra model: %s, spend: %.3f s.' % (g_model_dir, time.time() - t1))
Example #12
    def __init__(self,
                 d_model_dir=config.electra_D_model_dir,
                 g_model_dir=config.electra_G_model_dir,
                 device=device_id):
        super(ElectraCorrector, self).__init__()
        self.name = 'electra_corrector'
        t1 = time.time()
        self.g_model = pipeline(
            "fill-mask",
            model=g_model_dir,
            tokenizer=g_model_dir,
            device=device,  # gpu device id
        )
        self.d_model = ElectraForPreTraining.from_pretrained(d_model_dir)

        if self.g_model:
            self.mask = self.g_model.tokenizer.mask_token
            logger.debug('Loaded electra model: %s, spend: %.3f s.' %
                         (g_model_dir, time.time() - t1))
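Given the generator pipeline and discriminator loaded above, a correction pass can be sketched as a method like the one below. This is illustrative only; the method name and logic are assumptions, not the library's actual implementation.

    def correct_sketch(self, sentence):
        # Illustrative sketch: flag suspicious tokens with the discriminator,
        # then ask the fill-mask generator for replacements.
        tokens = self.g_model.tokenizer.tokenize(sentence)
        inputs = self.g_model.tokenizer(sentence, return_tensors="pt")
        logits = self.d_model(**inputs)[0][0]                       # per-token "replaced" logits
        flagged = (logits[1:-1] > 0).nonzero().flatten().tolist()   # skip [CLS]/[SEP]
        for idx in flagged:
            masked = tokens.copy()
            masked[idx] = self.mask
            candidates = self.g_model(" ".join(masked))             # fill-mask pipeline
            tokens[idx] = candidates[0]["token_str"]
        return "".join(tokens)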
Example #13
 def create_and_check_electra_for_pretraining(
     self,
     config,
     input_ids,
     token_type_ids,
     input_mask,
     sequence_labels,
     token_labels,
     choice_labels,
     fake_token_labels,
 ):
     config.num_labels = self.num_labels
     model = ElectraForPreTraining(config=config)
     model.to(torch_device)
     model.eval()
     result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=fake_token_labels)
     self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length))
Example #14
    disc_pred = disc_logits > 0
    return gen_pred, generated, disc_pred, is_replaced

# %% [markdown]
# # 5. Train

# %%
# Generator and Discriminator
if c.my_model:
  gen_hparam['tie_in_out_embedding'] = c.tie_gen_in_out_embedding
  generator = ModelForGenerator(gen_hparam)
  discriminator = ModelForDiscriminator(disc_hparam)
  discriminator.electra.embedding = generator.electra.embedding
else:
  generator = ElectraForMaskedLM(gen_config)
  discriminator = ElectraForPreTraining(disc_config)
  discriminator.electra.embeddings = generator.electra.embeddings
  if c.tie_gen_in_out_embedding:
    generator.generator_predictions.dense.weight = generator.electra.embeddings.word_embeddings.weight
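  # Sanity check (sketch): assigning the whole `generator.electra.embeddings` module
  # above ties the embeddings by reference, so generator and discriminator share the
  # same weight tensors rather than holding copies.
  assert (discriminator.electra.embeddings.word_embeddings.weight.data_ptr()
          == generator.electra.embeddings.word_embeddings.weight.data_ptr())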

# ELECTRA training loop
electra_model = ELECTRAModel(generator, discriminator, hf_tokenizer)
electra_loss_func = ELECTRALoss(gen_label_smooth=c.gen_smooth_label, disc_label_smooth=c.disc_smooth_label)

# jit (Haven't figured out how to make it work)
# input_ids, sentA_lenths = dls.one_batch()
# masked_inputs, labels, is_mlm_applied = mlm_cb.mask_tokens(input_ids)
# electra_jit_model = torch.jit.trace(electra_model, (masked_inputs, sentA_lenths, is_mlm_applied, labels))

# Optimizer
if c.adam_bias_correction: opt_func = partial(Adam, eps=1e-6, mom=0.9, sqr_mom=0.999, wd=0.01)
Example #15
    def __init__(
        self,
        model_type,
        model_name,
        generator_name=None,
        discriminator_name=None,
        train_files=None,
        args=None,
        use_cuda=True,
        cuda_device=-1,
        **kwargs,
    ):

        """
        Initializes a LanguageModelingModel.

        Args:
            model_type: The type of model (gpt2, openai-gpt, bert, roberta, distilbert, camembert)
            model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_model.bin).
            generator_name (optional): A pretrained model name or path to a directory containing an ELECTRA generator model.
            discriminator_name (optional): A pretrained model name or path to a directory containing an ELECTRA discriminator model.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            train_files (optional): List of files to be used when training the tokenizer.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
            cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
            **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
        """  # noqa: ignore flake8"

        if args and "manual_seed" in args:
            random.seed(args["manual_seed"])
            np.random.seed(args["manual_seed"])
            torch.manual_seed(args["manual_seed"])
            if "n_gpu" in args and args["n_gpu"] > 0:
                torch.cuda.manual_seed_all(args["manual_seed"])

        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError(
                    "'use_cuda' set to True when cuda is unavailable."
                    " Make sure CUDA is available or set use_cuda=False."
                )
        else:
            self.device = "cpu"

        self.results = {}

        self.args = {
            "dataset_type": "None",
            "dataset_class": None,
            "custom_tokenizer": None,
            "block_size": -1,
            "mlm": True,
            "mlm_probability": 0.15,
            "max_steps": -1,
            "config_name": None,
            "tokenizer_name": None,
            "min_frequency": 2,
            "special_tokens": ["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
            "sliding_window": False,
            "stride": 0.8,
            "generator_config": {},
            "discriminator_config": {},
            "vocab_size": None,
        }

        self.args.update(global_args)

        if not use_cuda:
            self.args["fp16"] = False

        if args:
            self.args.update(args)

        self.args["model_name"] = model_name
        self.args["model_type"] = model_type

        config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
        self.tokenizer_class = tokenizer_class
        new_tokenizer = False

        if self.args["tokenizer_name"]:
            self.tokenizer = tokenizer_class.from_pretrained(
                self.args["tokenizer_name"], cache_dir=self.args["cache_dir"]
            )
        elif self.args["model_name"]:
            self.tokenizer = tokenizer_class.from_pretrained(model_name, cache_dir=self.args["cache_dir"], **kwargs)
            self.args["tokenizer_name"] = self.args["model_name"]
        else:
            if not train_files:
                raise ValueError(
                    "model_name and tokenizer_name are not specified."
                    "You must specify train_files to train a Tokenizer."
                )
            else:
                self.train_tokenizer(train_files)
                new_tokenizer = True

        if self.args["config_name"]:
            self.config = config_class.from_pretrained(self.args["config_name"], cache_dir=self.args["cache_dir"])
        elif self.args["model_name"]:
            self.config = config_class.from_pretrained(model_name, cache_dir=self.args["cache_dir"], **kwargs)
        else:
            self.config = config_class(**self.args["config"], **kwargs)
        if self.args["vocab_size"]:
            self.config.vocab_size = self.args["vocab_size"]
        if new_tokenizer:
            self.config.vocab_size = len(self.tokenizer)

        if self.args["model_type"] == "electra":
            if generator_name:
                self.generator_config = ElectraConfig.from_pretrained(generator_name)
            elif self.args["model_name"]:
                self.generator_config = ElectraConfig.from_pretrained(
                    os.path.join(self.args["model_name"], "generator_config"), **kwargs,
                )
            else:
                self.generator_config = ElectraConfig(**self.args["generator_config"], **kwargs)
                if new_tokenizer:
                    self.generator_config.vocab_size = len(self.tokenizer)

            if discriminator_name:
                self.discriminator_config = ElectraConfig.from_pretrained(discriminator_name)
            elif self.args["model_name"]:
                self.discriminator_config = ElectraConfig.from_pretrained(
                    os.path.join(self.args["model_name"], "discriminator_config"), **kwargs,
                )
            else:
                self.discriminator_config = ElectraConfig(**self.args["discriminator_config"], **kwargs)
                if new_tokenizer:
                    self.discriminator_config.vocab_size = len(self.tokenizer)

        if self.args["block_size"] <= 0:
            self.args["block_size"] = min(self.args["max_seq_length"], self.tokenizer.max_len)
        else:
            self.args["block_size"] = min(self.args["block_size"], self.tokenizer.max_len, self.args["max_seq_length"])

        if self.args["model_name"]:
            if self.args["model_type"] == "electra":
                self.model = model_class.from_pretrained(
                    model_name,
                    config=self.config,
                    cache_dir=self.args["cache_dir"],
                    generator_config=self.generator_config,
                    discriminator_config=self.discriminator_config,
                    **kwargs,
                )
                self.model.load_state_dict(torch.load(os.path.join(self.args["model_name"], "pytorch_model.bin")))
            else:
                self.model = model_class.from_pretrained(
                    model_name, config=self.config, cache_dir=self.args["cache_dir"], **kwargs,
                )
        else:
            logger.info(" Training language model from scratch")
            if self.args["model_type"] == "electra":
                generator_model = ElectraForMaskedLM(config=self.generator_config)
                discriminator_model = ElectraForPreTraining(config=self.discriminator_config)
                self.model = ElectraForLanguageModelingModel(
                    config=self.config,
                    generator_model=generator_model,
                    discriminator_model=discriminator_model,
                    generator_config=self.generator_config,
                    discriminator_config=self.discriminator_config,
                )
                model_to_resize = (
                    self.model.generator_model.module
                    if hasattr(self.model.generator_model, "module")
                    else self.model.generator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))

                model_to_resize = (
                    self.model.discriminator_model.module
                    if hasattr(self.model.discriminator_model, "module")
                    else self.model.discriminator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))
            else:
                self.model = model_class(config=self.config)
                model_to_resize = self.model.module if hasattr(self.model, "module") else self.model
                model_to_resize.resize_token_embeddings(len(self.tokenizer))

        if model_type in ["camembert", "xlmroberta"]:
            warnings.warn(
                f"use_multiprocessing automatically disabled as {model_type}"
                " fails when using multiprocessing for feature conversion."
            )
            self.args["use_multiprocessing"] = False

        if self.args["wandb_project"] and not wandb_available:
            warnings.warn("wandb_project specified but wandb is not available. Wandb disabled.")
            self.args["wandb_project"] = None
Example #16
import os
import sys

import torch

from fio import read
from transformers import ElectraTokenizer, ElectraForPreTraining
"""
python ppl.py [hypothesis file]
"""

hyp_file = sys.argv[1]
assert os.path.exists(hyp_file)

device = torch.device("cuda:0")

tokenizer = ElectraTokenizer.from_pretrained(
    'google/electra-small-discriminator')
model = ElectraForPreTraining.from_pretrained(
    'google/electra-small-discriminator')
model.to(device)
model.eval()

sigmoid = torch.nn.Sigmoid()
pad_id = tokenizer.pad_token_id

src_tsf_pairs = [line.split("\t")[:2] for line in read(hyp_file)]
src_tsf_pairs = [(tokenizer.encode(src), tokenizer.encode(tsf))
                 for src, tsf in src_tsf_pairs]


def format_samples(pairs):
    src_bat, tsf_bat = zip(*pairs)
    max_l_src = max(len(src) for src in src_bat)
    max_l_tsf = max(len(tsf) for tsf in tsf_bat)
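The snippet is truncated here. Purely as an illustration (none of the names below come from the original script), a padded batch of encoded sentences could be scored with the discriminator along these lines:

# Illustrative sketch only, not a continuation of the truncated function above.
def score_batch(batch_ids):
    max_len = max(len(ids) for ids in batch_ids)
    input_ids = torch.tensor(
        [ids + [pad_id] * (max_len - len(ids)) for ids in batch_ids]).to(device)
    attention_mask = (input_ids != pad_id).long()
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask)[0]
    # Average per-token "replaced" probability, ignoring padding positions.
    probs = sigmoid(logits)
    return (probs * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)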
Example #17
    def __init__(
        self,
        model_type,
        model_name,
        generator_name=None,
        discriminator_name=None,
        train_files=None,
        args=None,
        use_cuda=True,
        cuda_device=-1,
        **kwargs,
    ):

        """
        Initializes a LanguageModelingModel.

        Args:
            model_type: The type of model (gpt2, openai-gpt, bert, roberta, distilbert, camembert)
            model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_model.bin).
            generator_name (optional): A pretrained model name or path to a directory containing an ELECTRA generator model.
            discriminator_name (optional): A pretrained model name or path to a directory containing an ELECTRA discriminator model.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            train_files (optional): List of files to be used when training the tokenizer.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
            cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
            **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
        """  # noqa: ignore flake8"

        self.args = self._load_model_args(model_name)

        if isinstance(args, dict):
            self.args.update_from_dict(args)
        elif isinstance(args, LanguageModelingArgs):
            self.args = args

        if "sweep_config" in kwargs:
            sweep_config = kwargs.pop("sweep_config")
            sweep_values = {key: value["value"] for key, value in sweep_config.as_dict().items() if key != "_wandb"}
            self.args.update_from_dict(sweep_values)

        if self.args.manual_seed:
            random.seed(self.args.manual_seed)
            np.random.seed(self.args.manual_seed)
            torch.manual_seed(self.args.manual_seed)
            if self.args.n_gpu > 0:
                torch.cuda.manual_seed_all(self.args.manual_seed)

        if self.args.local_rank != -1:
            logger.info(f"local_rank: {self.args.local_rank}")
            torch.distributed.init_process_group(backend="nccl")
            cuda_device = self.args.local_rank

        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError(
                    "'use_cuda' set to True when cuda is unavailable."
                    " Make sure CUDA is available or set use_cuda=False."
                )
        else:
            self.device = "cpu"

        self.results = {}

        if not use_cuda:
            self.args.fp16 = False

        self.args.model_name = model_name
        self.args.model_type = model_type

        config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
        self.tokenizer_class = tokenizer_class
        new_tokenizer = False

        if self.args.tokenizer_name:
            self.tokenizer = tokenizer_class.from_pretrained(self.args.tokenizer_name, cache_dir=self.args.cache_dir)
        elif self.args.model_name:
            if self.args.model_name == "electra":
                self.tokenizer = tokenizer_class.from_pretrained(
                    generator_name, cache_dir=self.args.cache_dir, **kwargs
                )
                self.args.tokenizer_name = self.args.model_name
            else:
                self.tokenizer = tokenizer_class.from_pretrained(model_name, cache_dir=self.args.cache_dir, **kwargs)
                self.args.tokenizer_name = self.args.model_name
        else:
            if not train_files:
                raise ValueError(
                    "model_name and tokenizer_name are not specified."
                    "You must specify train_files to train a Tokenizer."
                )
            else:
                self.train_tokenizer(train_files)
                new_tokenizer = True

        if self.args.config_name:
            self.config = config_class.from_pretrained(self.args.config_name, cache_dir=self.args.cache_dir)
        elif self.args.model_name and self.args.model_name != "electra":
            self.config = config_class.from_pretrained(model_name, cache_dir=self.args.cache_dir, **kwargs)
        else:
            self.config = config_class(**self.args.config, **kwargs)
        if self.args.vocab_size:
            self.config.vocab_size = self.args.vocab_size
        if new_tokenizer:
            self.config.vocab_size = len(self.tokenizer)

        if self.args.model_type == "electra":
            if generator_name:
                self.generator_config = ElectraConfig.from_pretrained(generator_name)
            elif self.args.model_name:
                self.generator_config = ElectraConfig.from_pretrained(
                    os.path.join(self.args.model_name, "generator_config"), **kwargs,
                )
            else:
                self.generator_config = ElectraConfig(**self.args.generator_config, **kwargs)
                if new_tokenizer:
                    self.generator_config.vocab_size = len(self.tokenizer)

            if discriminator_name:
                self.discriminator_config = ElectraConfig.from_pretrained(discriminator_name)
            elif self.args.model_name:
                self.discriminator_config = ElectraConfig.from_pretrained(
                    os.path.join(self.args.model_name, "discriminator_config"), **kwargs,
                )
            else:
                self.discriminator_config = ElectraConfig(**self.args.discriminator_config, **kwargs)
                if new_tokenizer:
                    self.discriminator_config.vocab_size = len(self.tokenizer)

        if self.args.block_size <= 0:
            self.args.block_size = min(self.args.max_seq_length, self.tokenizer.max_len)
        else:
            self.args.block_size = min(self.args.block_size, self.tokenizer.max_len, self.args.max_seq_length)

        if self.args.model_name:
            if self.args.model_type == "electra":
                if self.args.model_name == "electra":
                    generator_model = ElectraForMaskedLM.from_pretrained(generator_name)
                    discriminator_model = ElectraForPreTraining.from_pretrained(discriminator_name)
                    self.model = ElectraForLanguageModelingModel(
                        config=self.config,
                        generator_model=generator_model,
                        discriminator_model=discriminator_model,
                        generator_config=self.generator_config,
                        discriminator_config=self.discriminator_config,
                        tie_generator_and_discriminator_embeddings=self.args.tie_generator_and_discriminator_embeddings,
                    )
                    model_to_resize = (
                        self.model.generator_model.module
                        if hasattr(self.model.generator_model, "module")
                        else self.model.generator_model
                    )
                    model_to_resize.resize_token_embeddings(len(self.tokenizer))

                    model_to_resize = (
                        self.model.discriminator_model.module
                        if hasattr(self.model.discriminator_model, "module")
                        else self.model.discriminator_model
                    )
                    model_to_resize.resize_token_embeddings(len(self.tokenizer))
                    self.model.generator_model = generator_model
                    self.model.discriminator_model = discriminator_model
                else:
                    self.model = model_class.from_pretrained(
                        model_name,
                        config=self.config,
                        cache_dir=self.args.cache_dir,
                        generator_config=self.generator_config,
                        discriminator_config=self.discriminator_config,
                        **kwargs,
                    )
                    self.model.load_state_dict(torch.load(os.path.join(self.args.model_name, "pytorch_model.bin")))
            else:
                self.model = model_class.from_pretrained(
                    model_name, config=self.config, cache_dir=self.args.cache_dir, **kwargs,
                )
        else:
            logger.info(" Training language model from scratch")
            if self.args.model_type == "electra":
                generator_model = ElectraForMaskedLM(config=self.generator_config)
                discriminator_model = ElectraForPreTraining(config=self.discriminator_config)
                self.model = ElectraForLanguageModelingModel(
                    config=self.config,
                    generator_model=generator_model,
                    discriminator_model=discriminator_model,
                    generator_config=self.generator_config,
                    discriminator_config=self.discriminator_config,
                    tie_generator_and_discriminator_embeddings=self.args.tie_generator_and_discriminator_embeddings,
                )
                model_to_resize = (
                    self.model.generator_model.module
                    if hasattr(self.model.generator_model, "module")
                    else self.model.generator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))

                model_to_resize = (
                    self.model.discriminator_model.module
                    if hasattr(self.model.discriminator_model, "module")
                    else self.model.discriminator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))
            else:
                self.model = model_class(config=self.config)
                model_to_resize = self.model.module if hasattr(self.model, "module") else self.model
                model_to_resize.resize_token_embeddings(len(self.tokenizer))

        if model_type in ["camembert", "xlmroberta"]:
            warnings.warn(
                f"use_multiprocessing automatically disabled as {model_type}"
                " fails when using multiprocessing for feature conversion."
            )
            self.args.use_multiprocessing = False

        if self.args.wandb_project and not wandb_available:
            warnings.warn("wandb_project specified but wandb is not available. Wandb disabled.")
            self.args.wandb_project = None
Example #18
def get_glue_learner(task, run_name=None, inference=False):

  # Num_epochs
  if task in ['rte', 'stsb']: num_epochs = 10
  else: num_epochs = 3

  # Dataloaders
  dls = glue_dls[task]
  if isinstance(c.device, str): dls.to(torch.device(c.device))
  elif isinstance(c.device, list): dls.to(torch.device('cuda', c.device[0]))
  else: dls.to(torch.device('cuda:0'))

  # Load pretrained model
  if not c.pretrained_checkpoint:
    discriminator = ElectraForPreTraining.from_pretrained(f"google/electra-{c.size}-discriminator")
  else:
    discriminator = ModelForDiscriminator(hparam) if c.my_model else ElectraForPreTraining(electra_config)
    load_part_model(c.pretrained_ckp_path, discriminator, 'discriminator')

  # Create finetuning model
  if task=='wnli' and c.wsc_trick: 
    model = ELECTRAWSCTrickModel(discriminator, hf_tokenizer.pad_token_id)
  else:
    model = SentencePredictor(discriminator.electra, electra_config.hidden_size, num_class=NUM_CLASS[task])

  # Discriminative learning rates
  splitter = partial( hf_electra_param_splitter, wsc_trick=(task=='wnli' and c.wsc_trick) )
  layer_lrs = get_layer_lrs(lr=c.lr, 
                            decay_rate=c.layer_lr_decay,
                            num_hidden_layers=electra_config.num_hidden_layers,)
  
  # Optimizer
  if c.adam_bias_correction: opt_func = partial(Adam, eps=1e-6, mom=0.9, sqr_mom=0.999, wd=0.01)
  else: opt_func = partial(Adam_no_bias_correction, eps=1e-6, mom=0.9, sqr_mom=0.999, wd=0.01)
  
  # Learner
  learn = Learner(dls, model,
                  loss_func=LOSS_FUNC[task], 
                  opt_func=opt_func,
                  metrics=[eval(f'{metric}()') for metric in METRICS[task]],
                  splitter=splitter if not inference else trainable_params,
                  lr=layer_lrs if not inference else defaults.lr,
                  path='./checkpoints',
                  model_dir='glue',)
  
  # Multi gpu
  if isinstance(c.device, list) or c.device is None:
    learn.model = nn.DataParallel(learn.model, device_ids=c.device)

  # Gradient clip
  learn.add_cb(GradientClipping(1.0))

  # Logging
  if run_name and not inference:
    neptune.create_experiment(name=run_name, params={'task':task, **c, **hparam_update})
    learn.add_cb(SimplerNeptuneCallback(False))

  # Learning rate schedule
  if c.schedule == 'one_cycle': 
    return learn, partial(learn.fit_one_cycle, n_epoch=num_epochs, lr_max=layer_lrs)
  elif c.schedule == 'adjusted_one_cycle':
    return learn, partial(learn.fit_one_cycle, n_epoch=num_epochs, lr_max=layer_lrs, div=1e5, pct_start=0.1)
  else:
    lr_shed_func = linear_warmup_and_then_decay if c.schedule=='separate_linear' else linear_warmup_and_decay
    lr_shedule = ParamScheduler({'lr': partial(lr_shed_func,
                                               lr_max=np.array(layer_lrs),
                                               warmup_pct=0.1,
                                               total_steps=num_epochs*(len(dls.train)))})
    return learn, partial(learn.fit, n_epoch=num_epochs, cbs=[lr_shedule])
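A quick usage sketch for the learner factory above; the task and run names are placeholders.

# Usage sketch: build the learner, then run the schedule-specific fit function it returns.
learn, fit_fn = get_glue_learner("rte", run_name="electra_small_rte")
fit_fn()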
Example #19
def get_glue_learner(task, run_name=None, inference=False):
    is_wsc_trick = task == "wnli" and c.wsc_trick

    # Num_epochs
    if task in ["rte", "stsb"]:
        num_epochs = 10
    else:
        num_epochs = 3

    # Dataloaders
    dls = glue_dls[task]
    if isinstance(c.device, str):
        dls.to(torch.device(c.device))
    elif isinstance(c.device, list):
        dls.to(torch.device("cuda", c.device[0]))
    else:
        dls.to(torch.device("cuda:0"))

    # Load pretrained model
    if not c.pretrained_checkpoint:
        discriminator = ElectraForPreTraining.from_pretrained(
            f"google/electra-{c.size}-discriminator")
    else:
        discriminator = (ModelForDiscriminator(hparam) if c.my_model else
                         ElectraForPreTraining(electra_config))
        load_part_model(c.pretrained_ckp_path, discriminator, "discriminator")

    # Seeds & PyTorch benchmark
    torch.backends.cudnn.benchmark = True
    if c.seeds:
        dls[0].rng = random.Random(c.seeds[i])  # for fastai dataloader
        random.seed(c.seeds[i])
        np.random.seed(c.seeds[i])
        torch.manual_seed(c.seeds[i])

    # Create finetuning model
    if is_wsc_trick:
        model = ELECTRAWSCTrickModel(discriminator, hf_tokenizer.pad_token_id)
    else:
        model = SentencePredictor(discriminator.electra,
                                  electra_config.hidden_size,
                                  num_class=NUM_CLASS[task])

    # Discriminative learning rates
    splitter = partial(hf_electra_param_splitter, wsc_trick=is_wsc_trick)
    layer_lrs = get_layer_lrs(
        lr=c.lr,
        decay_rate=c.layer_lr_decay,
        num_hidden_layers=electra_config.num_hidden_layers,
    )

    # Optimizer
    if c.adam_bias_correction:
        opt_func = partial(Adam,
                           eps=1e-6,
                           mom=0.9,
                           sqr_mom=0.999,
                           wd=c.weight_decay)
    else:
        opt_func = partial(Adam_no_bias_correction,
                           eps=1e-6,
                           mom=0.9,
                           sqr_mom=0.999,
                           wd=c.weight_decay)

    # Learner
    learn = Learner(
        dls,
        model,
        loss_func=LOSS_FUNC[task],
        opt_func=opt_func,
        metrics=METRICS[task],
        splitter=splitter if not inference else trainable_params,
        lr=layer_lrs if not inference else defaults.lr,
        path="./checkpoints/glue",
        model_dir=c.group_name,
    )

    # Multi gpu
    if isinstance(c.device, list) or c.device is None:
        learn.create_opt()
        learn.model = nn.DataParallel(learn.model, device_ids=c.device)

    # Mixed precision
    learn.to_native_fp16(init_scale=2.0**14)

    # Gradient clip
    learn.add_cb(GradientClipping(1.0))

    # Logging
    if run_name and not inference:
        if c.logger == "neptune":
            neptune.create_experiment(name=run_name,
                                      params={
                                          "task": task,
                                          **c,
                                          **hparam_update
                                      })
            learn.add_cb(LightNeptuneCallback(False))
        elif c.logger == "wandb":
            wandb_run = wandb.init(
                name=run_name,
                project="electra_glue",
                config={
                    "task": task,
                    **c,
                    **hparam_update
                },
                reinit=True,
            )
            learn.add_cb(LightWandbCallback(wandb_run))

    # Learning rate schedule
    if c.schedule == "one_cycle":
        return learn, partial(learn.fit_one_cycle,
                              n_epoch=num_epochs,
                              lr_max=layer_lrs)
    elif c.schedule == "adjusted_one_cycle":
        return learn, partial(
            learn.fit_one_cycle,
            n_epoch=num_epochs,
            lr_max=layer_lrs,
            div=1e5,
            pct_start=0.1,
        )
    else:
        lr_shed_func = (linear_warmup_and_then_decay if c.schedule
                        == "separate_linear" else linear_warmup_and_decay)
        lr_shedule = ParamScheduler({
            "lr":
            partial(
                lr_shed_func,
                lr_max=np.array(layer_lrs),
                warmup_pct=0.1,
                total_steps=num_epochs * (len(dls.train)),
            )
        })
        return learn, partial(learn.fit, n_epoch=num_epochs, cbs=[lr_shedule])
Example #20
def train(rank, args):

    #######################
    ## distributed

    if args.distributed_enabled:
        torch.distributed.init_process_group(
            backend='nccl',
            init_method='env://',
            world_size=args.distributed_world_size,
            rank=rank)
    if args.gpu_enabled:
        device = torch.device('cuda:{}'.format(rank))
    else:
        device = torch.device('cpu')

    is_master = not args.distributed_enabled or rank == 0

    #######################
    ## preamble

    set_gpus(rank)
    set_seed(rank)
    set_cuda(deterministic=args.gpu_deterministic)

    output_dir = f'{args.output_dir}/{rank}'
    os.makedirs(output_dir, exist_ok=False)

    setup_logging(filename=f'{output_dir}/output.log', console=is_master)

    #######################
    ## dataset

    tokenizer = new_tokenizer(vocab_file=args.data_vocab_file)
    vocab_size = len(tokenizer.vocab)
    ds_train = wrap_example_builder(
        dataset=load_owt(owt_dir=args.data_dir,
                         n_tensors_per_file=args.data_n_tensors_per_file),
        vocab=tokenizer.vocab,
        max_length=args.data_max_seq_length)

    pad_token_id = tokenizer.vocab['[PAD]']
    mask_token_id = tokenizer.vocab['[MASK]']
    cls_token_id = tokenizer.vocab['[CLS]']
    sep_token_id = tokenizer.vocab['[SEP]']

    assert pad_token_id == 0
    assert cls_token_id == 101
    assert sep_token_id == 102
    assert mask_token_id == 103

    def collate_batch(examples):
        input_ids = torch.nn.utils.rnn.pad_sequence(
            [example['input_ids'] for example in examples],
            batch_first=True,
            padding_value=pad_token_id)
        input_mask = torch.nn.utils.rnn.pad_sequence(
            [example['input_mask'] for example in examples],
            batch_first=True,
            padding_value=pad_token_id)
        segment_ids = torch.nn.utils.rnn.pad_sequence(
            [example['segment_ids'] for example in examples],
            batch_first=True,
            padding_value=pad_token_id)
        return input_ids, input_mask, segment_ids

    def cycle(iterable):
        while True:
            for x in iterable:
                yield x

    ds_train_loader = iter(
        cycle(
            DataLoader(ds_train,
                       batch_size=args.opt_batch_size,
                       collate_fn=collate_batch)))

    #######################
    ## model

    def to_distributed_model(model):
        return model if not args.distributed_enabled else torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[rank], find_unused_parameters=True)

    def tie_weights(generator, discriminator):
        generator.electra.embeddings.word_embeddings = discriminator.electra.embeddings.word_embeddings
        generator.electra.embeddings.position_embeddings = discriminator.electra.embeddings.position_embeddings
        generator.electra.embeddings.token_type_embeddings = discriminator.electra.embeddings.token_type_embeddings

    class LogitsAdapter(torch.nn.Module):
        def __init__(self, adaptee):
            super().__init__()
            self.adaptee = adaptee

        def forward(self, *args, **kwargs):
            return self.adaptee(*args, **kwargs)[0]

    from transformers import AutoConfig, ElectraForMaskedLM, ElectraForPreTraining

    generator = ElectraForMaskedLM(
        AutoConfig.from_pretrained(args.model_generator))
    discriminator = ElectraForPreTraining(
        AutoConfig.from_pretrained(args.model_discriminator))

    tie_weights(generator, discriminator)

    model = to_distributed_model(
        Electra(LogitsAdapter(generator),
                LogitsAdapter(discriminator),
                num_tokens=vocab_size,
                mask_token_id=mask_token_id,
                pad_token_id=pad_token_id,
                mask_prob=args.model_mask_prob,
                mask_ignore_token_ids=[
                    tokenizer.vocab['[CLS]'], tokenizer.vocab['[SEP]']
                ],
                random_token_prob=0.0).to(device))

    #######################
    ## optimizer

    def get_linear_schedule_with_warmup(optimizer,
                                        num_warmup_steps,
                                        num_training_steps,
                                        last_epoch=-1):
        def lr_lambda(current_step):
            learning_rate = max(
                0.0, 1. - (float(current_step) / float(num_training_steps)))
            learning_rate *= min(1.0,
                                 float(current_step) / float(num_warmup_steps))
            return learning_rate

        return LambdaLR(optimizer, lr_lambda, last_epoch)

    def get_params_without_weight_decay_ln(named_params, weight_decay):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [
                    p for n, p in named_params
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                weight_decay,
            },
            {
                'params': [
                    p for n, p in named_params
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0,
            },
        ]
        return optimizer_grouped_parameters

    optimizer = torch.optim.AdamW(get_params_without_weight_decay_ln(
        model.named_parameters(), weight_decay=0.1),
                                  lr=args.opt_lr,
                                  betas=(0.9, 0.999),
                                  eps=1e-08)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.opt_warmup_steps,
        num_training_steps=args.opt_num_training_steps)
    scaler = torch.cuda.amp.GradScaler(enabled=args.gpu_mixed_precision)

    #######################
    ## train

    t, steps_s, eta_m = time(), 0., 0

    for step in range(args.opt_num_training_steps + 1):
        input_ids, input_mask, segment_ids = next(ds_train_loader)

        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)

        assert input_ids.shape[1] <= args.data_max_seq_length

        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=args.gpu_mixed_precision):
            loss, loss_mlm, loss_disc, acc_gen, acc_disc, disc_labels, disc_pred = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=segment_ids)

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        metrics = {
            'step': (step, '{:8d}'),
            'loss': (loss.item(), '{:8.5f}'),
            'loss_mlm': (loss_mlm.item(), '{:8.5f}'),
            'loss_disc': (loss_disc.item(), '{:8.5f}'),
            'acc_gen': (acc_gen.item(), '{:5.3f}'),
            'acc_disc': (acc_disc.item(), '{:5.3f}'),
            'lr': (scheduler.get_last_lr()[0], '{:8.7f}'),
            'steps': (steps_s, '{:4.1f}/s'),
            'eta': (eta_m, '{:4d}m'),
        }

        if step % args.step_log == 0:
            sep = ' ' * 2
            logger.info(
                sep.join([
                    f'{k}: {v[1].format(v[0])}' for (k, v) in metrics.items()
                ]))

        if step > 0 and step % 100 == 0:
            t2 = time()
            steps_s = 100. / (t2 - t)
            eta_m = int(((args.opt_num_training_steps - step) / steps_s) // 60)
            t = t2

        if step % 200 == 0:
            logger.info(
                np.array2string(disc_labels[0].cpu().numpy(),
                                threshold=sys.maxsize,
                                max_line_width=sys.maxsize))
            logger.info(
                np.array2string(disc_pred[0].cpu().numpy(),
                                threshold=sys.maxsize,
                                max_line_width=sys.maxsize))

        if step > 0 and step % args.step_ckpt == 0 and is_master:
            discriminator.electra.save_pretrained(
                f'{args.output_dir}/ckpt/{step}')
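Since the checkpoint above saves only the `discriminator.electra` backbone via `save_pretrained`, it can be reloaded with the standard Hugging Face API; task heads are freshly initialized when it is loaded into a downstream model class. A sketch with a placeholder path:

# Reloading a saved backbone checkpoint (sketch; the path is hypothetical).
from transformers import ElectraModel, ElectraForSequenceClassification

backbone = ElectraModel.from_pretrained("output/ckpt/100000")
classifier = ElectraForSequenceClassification.from_pretrained(
    "output/ckpt/100000", num_labels=2)  # classification head is newly initialized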