Example #1
 def __init__(self, config, **kwargs):
     super(ElectraForLanguageModelingModel, self).__init__(config, **kwargs)
     if "generator_config" in kwargs:
         generator_config = kwargs["generator_config"]
     else:
         generator_config = config
     self.generator_model = ElectraForMaskedLM(generator_config)
     if "discriminator_config" in kwargs:
         discriminator_config = kwargs["discriminator_config"]
     else:
         discriminator_config = config
     self.discriminator_model = ElectraForPreTraining(discriminator_config)
     self.vocab_size = config.vocab_size
     if kwargs.get("tie_generator_and_discriminator_embeddings", True):
         self.tie_generator_and_discriminator_embeddings()
Example #2
def main(train_cfg='config/electra_pretrain.json',
         model_cfg='config/electra_small.json',
         data_file='../tbc/books_large_all.txt',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         save_dir='../exp/electra/pretrain',
         log_dir='../exp/electra/pretrain/runs',
         max_len=128,
         max_pred=20,
         mask_prob=0.15):

    check_dirs_exist([log_dir, save_dir])

    train_cfg = ElectraConfig.from_json_file(train_cfg)
    model_cfg = ElectraConfig.from_json_file(model_cfg)

    set_seeds(train_cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab,
                                           do_lower_case=True)
    tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))

    pipeline = [
        Preprocess4Pretrain(max_pred, mask_prob, list(tokenizer.vocab.keys()),
                            tokenizer.convert_tokens_to_ids, max_len)
    ]

    data_iter = SentPairDataLoader(data_file,
                                   train_cfg.batch_size,
                                   tokenize,
                                   max_len,
                                   pipeline=pipeline)

    # Load the pretrained ELECTRA-small generator and discriminator
    generator = ElectraForMaskedLM.from_pretrained(
        'google/electra-small-generator')
    discriminator = ElectraForPreTraining.from_pretrained(
        'google/electra-small-discriminator')
    model = Electra(generator, discriminator)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

    base_trainer_args = (train_cfg, model_cfg, model, data_iter, optimizer,
                         save_dir, get_device())
    trainer = ElectraTrainer(writer, *base_trainer_args)
    trainer.train(model_file, None, data_parallel)
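
The snippet above defines main with keyword defaults but shows no entry point; one possible way to expose it from the command line is python-fire (an assumption, not part of the original example):

# Hypothetical entry point for main() above, using python-fire (assumption).
import fire

if __name__ == '__main__':
    fire.Fire(main)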
Example #3
    def __init__(self,
                 model_path='bert-base-uncased',
                 temperature=1.0,
                 top_k=None,
                 top_p=None,
                 device='cuda'):
        super().__init__(device,
                         temperature=temperature,
                         top_k=top_k,
                         top_p=top_p)
        self.model_path = model_path

        # self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # self.model = AutoModel.from_pretrained(model_path)
        self.tokenizer = ElectraTokenizer.from_pretrained(model_path)
        self.model = ElectraForMaskedLM.from_pretrained(model_path)

        self.model.to(self.device)
        self.model.eval()
Example #4
 def __init__(self, config, output_size=100, extra_args=None, **kwargs):
     super(ElectraForLanguageModelingModel, self).__init__(config, **kwargs)
     self.extra_args = extra_args
     if "generator_config" in kwargs:
         generator_config = kwargs["generator_config"]
     else:
         generator_config = config
     self.generator_model = ElectraForMaskedLM(generator_config)
     if "discriminator_config" in kwargs:
         discriminator_config = kwargs["discriminator_config"]
     else:
         discriminator_config = config
     self.discriminator_model = ElectraForPreTraining(discriminator_config, output_size=output_size, extra_args=self.extra_args)
     self.vocab_size = generator_config.vocab_size
     if kwargs.get("tie_generator_and_discriminator_embeddings", True):
         self.tie_generator_and_discriminator_embeddings()
     if "random_generator" in kwargs:
         self.random_generator = kwargs['random_generator']
         print(f'IN MODEL: RANDOM GENERATOR: {self.random_generator}')
Example #5
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator):
    # Initialise PyTorch model
    config = ElectraConfig.from_json_file(config_file)
    print(f"Building PyTorch model from configuration: {config}")

    if discriminator_or_generator == "discriminator":
        model = ElectraForPreTraining(config)
    elif discriminator_or_generator == "generator":
        model = ElectraForMaskedLM(config)
    else:
        raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'")

    # Load weights from tf checkpoint
    load_tf_weights_in_electra(
        model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator
    )

    # Save pytorch-model
    print(f"Save PyTorch model to {pytorch_dump_path}")
    torch.save(model.state_dict(), pytorch_dump_path)
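
A minimal command-line wrapper for convert_tf_checkpoint_to_pytorch above; this is a sketch, and the flag names are chosen here for illustration rather than taken from the original script.

# Sketch of a CLI entry point for the converter above; flag names are illustrative.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--tf_checkpoint_path", type=str, required=True)
    parser.add_argument("--config_file", type=str, required=True)
    parser.add_argument("--pytorch_dump_path", type=str, required=True)
    parser.add_argument("--discriminator_or_generator", type=str, required=True,
                        choices=["discriminator", "generator"])
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(
        args.tf_checkpoint_path, args.config_file,
        args.pytorch_dump_path, args.discriminator_or_generator)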
Example #6
 def create_and_check_electra_for_masked_lm(
     self,
     config,
     input_ids,
     token_type_ids,
     input_mask,
     sequence_labels,
     token_labels,
     choice_labels,
     fake_token_labels,
 ):
     model = ElectraForMaskedLM(config=config)
     model.to(torch_device)
     model.eval()
     result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
     self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
Example #7
def train(rank, args):

    #######################
    ## distributed

    if args.distributed_enabled:
        torch.distributed.init_process_group(
            backend='nccl',
            init_method='env://',
            world_size=args.distributed_world_size,
            rank=rank)
    if args.gpu_enabled:
        device = torch.device('cuda:{}'.format(rank))
    else:
        device = torch.device('cpu')

    is_master = not args.distributed_enabled or rank == 0

    #######################
    ## preamble

    set_gpus(rank)
    set_seed(rank)
    set_cuda(deterministic=args.gpu_deterministic)

    output_dir = f'{args.output_dir}/{rank}'
    os.makedirs(output_dir, exist_ok=False)

    setup_logging(filename=f'{output_dir}/output.log', console=is_master)

    #######################
    ## dataset

    tokenizer = new_tokenizer(vocab_file=args.data_vocab_file)
    vocab_size = len(tokenizer.vocab)
    ds_train = wrap_example_builder(
        dataset=load_owt(owt_dir=args.data_dir,
                         n_tensors_per_file=args.data_n_tensors_per_file),
        vocab=tokenizer.vocab,
        max_length=args.data_max_seq_length)

    pad_token_id = tokenizer.vocab['[PAD]']
    mask_token_id = tokenizer.vocab['[MASK]']
    cls_token_id = tokenizer.vocab['[CLS]']
    sep_token_id = tokenizer.vocab['[SEP]']

    def collate_batch(examples):
        input_ids = torch.nn.utils.rnn.pad_sequence(
            [example['input_ids'] for example in examples],
            batch_first=True,
            padding_value=pad_token_id)
        input_mask = torch.nn.utils.rnn.pad_sequence(
            [example['input_mask'] for example in examples],
            batch_first=True,
            padding_value=pad_token_id)
        segment_ids = torch.nn.utils.rnn.pad_sequence(
            [example['segment_ids'] for example in examples],
            batch_first=True,
            padding_value=pad_token_id)
        return input_ids, input_mask, segment_ids

    def cycle(iterable):
        while True:
            for x in iterable:
                yield x

    ds_train_loader = iter(
        cycle(
            DataLoader(ds_train,
                       batch_size=args.opt_batch_size,
                       collate_fn=collate_batch)))

    #######################
    ## model

    def to_distributed_model(model):
        return model if not args.distributed_enabled else torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[rank], find_unused_parameters=True)

    def tie_weights(generator, discriminator):
        generator.electra.embeddings.word_embeddings = discriminator.electra.embeddings.word_embeddings
        generator.electra.embeddings.position_embeddings = discriminator.electra.embeddings.position_embeddings
        generator.electra.embeddings.token_type_embeddings = discriminator.electra.embeddings.token_type_embeddings

    class LogitsAdapter(torch.nn.Module):
        def __init__(self, adaptee):
            super().__init__()
            self.adaptee = adaptee

        def forward(self, *args, **kwargs):
            return self.adaptee(*args, **kwargs)[0]

    from transformers import AutoConfig, ElectraForMaskedLM, ElectraForPreTraining

    generator = ElectraForMaskedLM(
        AutoConfig.from_pretrained(args.model_generator))
    discriminator = ElectraForPreTraining(
        AutoConfig.from_pretrained(args.model_discriminator))

    tie_weights(generator, discriminator)

    model = to_distributed_model(
        Electra(LogitsAdapter(generator),
                LogitsAdapter(discriminator),
                num_tokens=vocab_size,
                mask_token_id=mask_token_id,
                pad_token_id=pad_token_id,
                mask_prob=args.model_mask_prob,
                mask_ignore_token_ids=[
                    tokenizer.vocab['[CLS]'], tokenizer.vocab['[SEP]']
                ],
                random_token_prob=0.0).to(device))

    #######################
    ## optimizer

    def get_linear_schedule_with_warmup(optimizer,
                                        num_warmup_steps,
                                        num_training_steps,
                                        last_epoch=-1):
        def lr_lambda(current_step):
            learning_rate = max(
                0.0, 1. - (float(current_step) / float(num_training_steps)))
            learning_rate *= min(1.0,
                                 float(current_step) / float(num_warmup_steps))
            return learning_rate

        return LambdaLR(optimizer, lr_lambda, last_epoch)

    def get_params_without_weight_decay_ln(named_params, weight_decay):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [
                    p for n, p in named_params
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                weight_decay,
            },
            {
                'params': [
                    p for n, p in named_params
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0,
            },
        ]
        return optimizer_grouped_parameters

    optimizer = torch.optim.AdamW(get_params_without_weight_decay_ln(
        model.named_parameters(), weight_decay=0.1),
                                  lr=args.opt_lr,
                                  betas=(0.9, 0.999),
                                  eps=1e-08)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.opt_warmup_steps,
        num_training_steps=args.opt_num_training_steps)
    scaler = torch.cuda.amp.GradScaler(enabled=args.gpu_mixed_precision)

    #######################
    ## train

    t, steps_s, eta_m = time(), 0., 0

    for step in range(args.opt_num_training_steps + 1):
        input_ids, input_mask, segment_ids = next(ds_train_loader)

        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)

        assert input_ids.shape[1] <= args.data_max_seq_length

        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=args.gpu_mixed_precision):
            loss, loss_mlm, loss_disc, acc_gen, acc_disc, disc_labels, disc_pred = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=segment_ids)

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        metrics = {
            'step': (step, '{:8d}'),
            'loss': (loss.item(), '{:8.5f}'),
            'loss_mlm': (loss_mlm.item(), '{:8.5f}'),
            'loss_disc': (loss_disc.item(), '{:8.5f}'),
            'acc_gen': (acc_gen.item(), '{:5.3f}'),
            'acc_disc': (acc_disc.item(), '{:5.3f}'),
            'lr': (scheduler.get_last_lr()[0], '{:8.7f}'),
            'steps': (steps_s, '{:4.1f}/s'),
            'eta': (eta_m, '{:4d}m'),
        }

        if step % args.step_log == 0:
            sep = ' ' * 2
            logger.info(
                sep.join([
                    f'{k}: {v[1].format(v[0])}' for (k, v) in metrics.items()
                ]))

        if step > 0 and step % 100 == 0:
            t2 = time()
            steps_s = 100. / (t2 - t)
            eta_m = int(((args.opt_num_training_steps - step) / steps_s) // 60)
            t = t2

        if step % 200 == 0:
            logger.info(
                np.array2string(disc_labels[0].cpu().numpy(),
                                threshold=sys.maxsize,
                                max_line_width=sys.maxsize))
            logger.info(
                np.array2string(disc_pred[0].cpu().numpy(),
                                threshold=sys.maxsize,
                                max_line_width=sys.maxsize))

        if step > 0 and step % args.step_ckpt == 0 and is_master:
            discriminator.electra.save_pretrained(
                f'{args.output_dir}/ckpt/{step}')
Example #8
    def __init__(
        self,
        model_type,
        model_name,
        generator_name=None,
        discriminator_name=None,
        train_files=None,
        args=None,
        use_cuda=True,
        cuda_device=-1,
        **kwargs,
    ):

        """
        Initializes a LanguageModelingModel.

        Args:
            model_type: The type of model (gpt2, openai-gpt, bert, roberta, distilbert, camembert)
            model_name: Default Transformer model name or path to a directory containing a Transformer model file (pytorch_model.bin).
            generator_name (optional): A pretrained model name or path to a directory containing an ELECTRA generator model.
            discriminator_name (optional): A pretrained model name or path to a directory containing an ELECTRA discriminator model.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            train_files (optional): List of files to be used when training the tokenizer.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
            cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
            **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
        """  # noqa: ignore flake8"

        if args and "manual_seed" in args:
            random.seed(args["manual_seed"])
            np.random.seed(args["manual_seed"])
            torch.manual_seed(args["manual_seed"])
            if "n_gpu" in args and args["n_gpu"] > 0:
                torch.cuda.manual_seed_all(args["manual_seed"])

        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError(
                    "'use_cuda' set to True when cuda is unavailable."
                    " Make sure CUDA is available or set use_cuda=False."
                )
        else:
            self.device = "cpu"

        self.results = {}

        self.args = {
            "dataset_type": "None",
            "dataset_class": None,
            "custom_tokenizer": None,
            "block_size": -1,
            "mlm": True,
            "mlm_probability": 0.15,
            "max_steps": -1,
            "config_name": None,
            "tokenizer_name": None,
            "min_frequency": 2,
            "special_tokens": ["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
            "sliding_window": False,
            "stride": 0.8,
            "generator_config": {},
            "discriminator_config": {},
            "vocab_size": None,
        }

        self.args.update(global_args)

        if not use_cuda:
            self.args["fp16"] = False

        if args:
            self.args.update(args)

        self.args["model_name"] = model_name
        self.args["model_type"] = model_type

        config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
        self.tokenizer_class = tokenizer_class
        new_tokenizer = False

        if self.args["tokenizer_name"]:
            self.tokenizer = tokenizer_class.from_pretrained(
                self.args["tokenizer_name"], cache_dir=self.args["cache_dir"]
            )
        elif self.args["model_name"]:
            self.tokenizer = tokenizer_class.from_pretrained(model_name, cache_dir=self.args["cache_dir"], **kwargs)
            self.args["tokenizer_name"] = self.args["model_name"]
        else:
            if not train_files:
                raise ValueError(
                    "model_name and tokenizer_name are not specified."
                    "You must specify train_files to train a Tokenizer."
                )
            else:
                self.train_tokenizer(train_files)
                new_tokenizer = True

        if self.args["config_name"]:
            self.config = config_class.from_pretrained(self.args["config_name"], cache_dir=self.args["cache_dir"])
        elif self.args["model_name"]:
            self.config = config_class.from_pretrained(model_name, cache_dir=self.args["cache_dir"], **kwargs)
        else:
            self.config = config_class(**self.args["config"], **kwargs)
        if self.args["vocab_size"]:
            self.config.vocab_size = self.args["vocab_size"]
        if new_tokenizer:
            self.config.vocab_size = len(self.tokenizer)

        if self.args["model_type"] == "electra":
            if generator_name:
                self.generator_config = ElectraConfig.from_pretrained(generator_name)
            elif self.args["model_name"]:
                self.generator_config = ElectraConfig.from_pretrained(
                    os.path.join(self.args["model_name"], "generator_config"), **kwargs,
                )
            else:
                self.generator_config = ElectraConfig(**self.args["generator_config"], **kwargs)
                if new_tokenizer:
                    self.generator_config.vocab_size = len(self.tokenizer)

            if discriminator_name:
                self.discriminator_config = ElectraConfig.from_pretrained(discriminator_name)
            elif self.args["model_name"]:
                self.discriminator_config = ElectraConfig.from_pretrained(
                    os.path.join(self.args["model_name"], "discriminator_config"), **kwargs,
                )
            else:
                self.discriminator_config = ElectraConfig(**self.args["discriminator_config"], **kwargs)
                if new_tokenizer:
                    self.discriminator_config.vocab_size = len(self.tokenizer)

        if self.args["block_size"] <= 0:
            self.args["block_size"] = min(self.args["max_seq_length"], self.tokenizer.max_len)
        else:
            self.args["block_size"] = min(self.args["block_size"], self.tokenizer.max_len, self.args["max_seq_length"])

        if self.args["model_name"]:
            if self.args["model_type"] == "electra":
                self.model = model_class.from_pretrained(
                    model_name,
                    config=self.config,
                    cache_dir=self.args["cache_dir"],
                    generator_config=self.generator_config,
                    discriminator_config=self.discriminator_config,
                    **kwargs,
                )
                self.model.load_state_dict(torch.load(os.path.join(self.args["model_name"], "pytorch_model.bin")))
            else:
                self.model = model_class.from_pretrained(
                    model_name, config=self.config, cache_dir=self.args["cache_dir"], **kwargs,
                )
        else:
            logger.info(" Training language model from scratch")
            if self.args["model_type"] == "electra":
                generator_model = ElectraForMaskedLM(config=self.generator_config)
                discriminator_model = ElectraForPreTraining(config=self.discriminator_config)
                self.model = ElectraForLanguageModelingModel(
                    config=self.config,
                    generator_model=generator_model,
                    discriminator_model=discriminator_model,
                    generator_config=self.generator_config,
                    discriminator_config=self.discriminator_config,
                )
                model_to_resize = (
                    self.model.generator_model.module
                    if hasattr(self.model.generator_model, "module")
                    else self.model.generator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))

                model_to_resize = (
                    self.model.discriminator_model.module
                    if hasattr(self.model.discriminator_model, "module")
                    else self.model.discriminator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))
            else:
                self.model = model_class(config=self.config)
                model_to_resize = self.model.module if hasattr(self.model, "module") else self.model
                model_to_resize.resize_token_embeddings(len(self.tokenizer))

        if model_type in ["camembert", "xlmroberta"]:
            warnings.warn(
                f"use_multiprocessing automatically disabled as {model_type}"
                " fails when using multiprocessing for feature conversion."
            )
            self.args["use_multiprocessing"] = False

        if self.args["wandb_project"] and not wandb_available:
            warnings.warn("wandb_project specified but wandb is not available. Wandb disabled.")
            self.args["wandb_project"] = None
Example #9
class ElectraForLanguageModelingModel(PreTrainedModel):
    def __init__(self, config, **kwargs):
        super(ElectraForLanguageModelingModel, self).__init__(config, **kwargs)
        if "generator_config" in kwargs:
            generator_config = kwargs["generator_config"]
        else:
            generator_config = config
        self.generator_model = ElectraForMaskedLM(generator_config)
        if "discriminator_config" in kwargs:
            discriminator_config = kwargs["discriminator_config"]
        else:
            discriminator_config = config
        self.discriminator_model = ElectraForPreTraining(discriminator_config)
        self.vocab_size = generator_config.vocab_size
        if kwargs.get("tie_generator_and_discriminator_embeddings", True):
            self.tie_generator_and_discriminator_embeddings()

    def tie_generator_and_discriminator_embeddings(self):
        self.discriminator_model.set_input_embeddings(
            self.generator_model.get_input_embeddings())

    def forward(self,
                inputs,
                masked_lm_labels,
                attention_mask=None,
                token_type_ids=None):
        d_inputs = inputs.clone()

        # run masked LM.
        g_out = self.generator_model(inputs,
                                     masked_lm_labels=masked_lm_labels,
                                     attention_mask=attention_mask,
                                     token_type_ids=token_type_ids)

        # get samples from masked LM.
        sample_probs = torch.softmax(g_out[1], dim=-1, dtype=torch.float32)
        sample_probs = sample_probs.view(-1, self.vocab_size)

        sampled_tokens = torch.multinomial(sample_probs, 1).view(-1)
        sampled_tokens = sampled_tokens.view(d_inputs.shape[0], -1)

        # labels have a -100 value to mask out loss from unchanged tokens.
        mask = masked_lm_labels.ne(-100)

        # replace the masked out tokens of the input with the generator predictions.
        d_inputs[mask] = sampled_tokens[mask]

        # turn mask into new target labels.  1 (True) for corrupted, 0 otherwise.
        # if the prediction was correct, mark it as uncorrupted.
        correct_preds = sampled_tokens == masked_lm_labels
        d_labels = mask.long()
        d_labels[correct_preds] = 0

        # run token classification, predict whether each token was corrupted.
        d_out = self.discriminator_model(d_inputs,
                                         labels=d_labels,
                                         attention_mask=attention_mask,
                                         token_type_ids=token_type_ids)

        g_loss = g_out[0]
        d_loss = d_out[0]
        g_scores = g_out[1]
        d_scores = d_out[1]
        return g_loss, d_loss, g_scores, d_scores, d_labels
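
A short usage sketch for the class above; it assumes the older transformers API in which ElectraForMaskedLM still accepts masked_lm_labels (as the forward() above does), and the tokenizer checkpoint is an illustrative choice.

# Illustrative use of ElectraForLanguageModelingModel above (assumes the older
# transformers API with masked_lm_labels, matching the forward() shown).
import torch
from transformers import ElectraConfig, ElectraTokenizer

config = ElectraConfig()                      # default small config, for the sketch
tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-generator")
model = ElectraForLanguageModelingModel(config)

enc = tokenizer("the quick brown fox jumps", return_tensors="pt")
inputs = enc["input_ids"].clone()
labels = torch.full_like(inputs, -100)        # -100 = ignored by the MLM loss

# mask a single position and keep its true id as the label
labels[0, 2] = inputs[0, 2]
inputs[0, 2] = tokenizer.mask_token_id

g_loss, d_loss, g_scores, d_scores, d_labels = model(
    inputs, masked_lm_labels=labels, attention_mask=enc["attention_mask"])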
Example #10
 # eval_dataset = GenNLPMaskedDataset(train_batch_paths[-1:], tokenizer, seed=seed, masked_by_flag=True, only_input=True)
 ## test data
 test_batch_paths = test_region_paths[i]
 test_dataset = GenNLPMaskedDataset(test_batch_paths,
                                    tokenizer,
                                    seed=seed,
                                    masked_by_flag=True,
                                    only_input=True)
 ## model
 modeling_args.vocab_size = tokenizer.vocab_size
 if mode == 'pretrain':
     modeling_args.max_position_embeddings = 1300
 else:
     modeling_args.max_position_embeddings = train_dataset.max_position_embeddings(
     )
 electra_model = ElectraForMaskedLM(modeling_args)
 if os.path.isdir(prevert_path):
     electra_model = ElectraForMaskedLM.from_pretrained(prevert_path)
 trainer = OTrainer(
     model=electra_model,
     args=training_args,
     train_dataset=train_dataset,
     eval_dataset=test_dataset,
     compute_metrics=r2_score_transformers,
 )
 trainer.train()
 trainer.save_model(save_path)
 output_test = trainer.predict(test_dataset)
 metrics = output_test.metrics
 test_result_path = os.path.join(save_path, 'test_result.json')
 with g.writing(test_result_path) as trf:
Example #11
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()

xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
xlnet_model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased').eval()

xlmroberta_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
    'xlm-roberta-base').eval()

bart_tokenizer = BartTokenizer.from_pretrained('bart-large')
bart_model = BartForConditionalGeneration.from_pretrained('bart-large').eval()

electra_tokenizer = ElectraTokenizer.from_pretrained(
    'google/electra-small-generator')
electra_model = ElectraForMaskedLM.from_pretrained(
    'google/electra-small-generator').eval()

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base').eval()

top_k = 10


def decode(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD]'
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if token not in ignore_tokens:
            tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])
Example #12
def load_ELECTRAsmall(ELECTRA_PATH):
    ELECTRAmodel = ElectraForMaskedLM.from_pretrained(ELECTRA_PATH)
    ELECTRAtokenizer = ElectraTokenizer.from_pretrained(ELECTRA_PATH)
    return ELECTRAmodel, ELECTRAtokenizer
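
A possible usage of load_ELECTRAsmall above; the checkpoint path and the .logits access (transformers v4 return-dict behaviour) are assumptions.

# Possible usage of load_ELECTRAsmall above; path and API details are assumptions.
import torch

model, tokenizer = load_ELECTRAsmall("google/electra-small-generator")
model.eval()

inputs = tokenizer("the capital of france is [MASK] .", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits            # assumes return_dict outputs (transformers v4)

mask_pos = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
top_ids = logits[0, mask_pos].argmax(dim=-1)
print(tokenizer.convert_ids_to_tokens(top_ids.tolist()))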
Example #13
    gen_pred = gen_logits.argmax(dim=-1)
    disc_pred = disc_logits > 0
    return gen_pred, generated, disc_pred, is_replaced

# %% [markdown]
# # 5. Train

# %%
# Generator and Discriminator
if c.my_model:
  gen_hparam['tie_in_out_embedding'] = c.tie_gen_in_out_embedding
  generator = ModelForGenerator(gen_hparam)
  discriminator = ModelForDiscriminator(disc_hparam)
  discriminator.electra.embedding = generator.electra.embedding
else:
  generator = ElectraForMaskedLM(gen_config)
  discriminator = ElectraForPreTraining(disc_config)
  discriminator.electra.embeddings = generator.electra.embeddings
  if c.tie_gen_in_out_embedding:
    generator.generator_predictions.dense.weight = generator.electra.embeddings.word_embeddings.weight

# ELECTRA training loop
electra_model = ELECTRAModel(generator, discriminator, hf_tokenizer)
electra_loss_func = ELECTRALoss(gen_label_smooth=c.gen_smooth_label, disc_label_smooth=c.disc_smooth_label)

# jit (Haven't figured out how to make it work)
# input_ids, sentA_lenths = dls.one_batch()
# masked_inputs, labels, is_mlm_applied = mlm_cb.mask_tokens(input_ids)
# electra_jit_model = torch.jit.trace(electra_model, (masked_inputs, sentA_lenths, is_mlm_applied, labels))

# Optimizer
Example #14
                                             skip_first=True)

# datasets
model_name = config.bert_model_name

tokenizer = RobertaTokenizer.from_pretrained(config.bert_model_name)

cur_swap_prob = 0.
max_swap_prob = 0.

if max_swap_prob == 0:
    wordswap_tokenizer = wordswap_model = None
else:
    wordswap_tokenizer = ElectraTokenizer.from_pretrained(
        'google/electra-small-generator')
    wordswap_model = ElectraForMaskedLM.from_pretrained(
        'google/electra-small-generator', return_dict=True).cuda()

if config.get("split_by_doc_lens"):
    sent_lens = config.get("sent_lens")

train_set = IEDataset(config.file_dir + config.train_file, config, word_vocab,
                      wordswap_tokenizer, wordswap_model)
dev_set = IEDataset(config.file_dir + config.dev_file, config, word_vocab,
                    wordswap_tokenizer, wordswap_model)
if config.get("split_by_doc_lens"):
    test_sets = []
    for i in range(1, len(sent_lens)):
        max_len = sent_lens[i]
        min_len = sent_lens[i - 1]
        test_sets.append(
            IEDataset(config.file_dir + config.test_file,
Example #15
def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

    model = ElectraForMaskedLM.from_pretrained(
        "monologg/koelectra-base-v2-discriminator").to(args.device)
    tokenizer = ElectraTokenizer.from_pretrained(
        "monologg/koelectra-base-v2-discriminator")
    print("get tokenizer, model success")
    '''
    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    '''
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    ##tokenizer = CharTokenizer([])

    bind_nsml(model, tokenizer, args)
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode == "train" or args.mode == "pretrain":
        if args.mode == "train":
            noisy_sents = read_strings(
                os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(
                os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(
                os.path.join(args.data_dir, "train_label"))

        if args.mode == "pretrain":
            print("PRETRAIN MODE ON!!")
            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/113'
            noisy_sents, clean_sents = load_generated_data(
                checkpoint=checkpoint, session=sess)
            sents_annotation = ['None'] * len(noisy_sents)

        error_type_counter = Counter()

        for annotation in sents_annotation:
            error_type_counter += Counter(annotation.split(','))

        print(error_type_counter)

        # cleaned-noise version
        # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]
        # original version
        pairs = [{
            "noisy": noisy,
            "clean": clean,
            "annotation": annot
        } for noisy, clean, annot in zip(noisy_sents, clean_sents,
                                         sents_annotation)]
        #print("error? 1")
        train_data, valid_data = pairs[:-args.num_val_data], pairs[
            -args.num_val_data:]
        logger.info(f"# of train data: {len(train_data)}")
        logger.info(f"# of valid data: {len(valid_data)}")
        #print("error? 2")

        #train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
        #tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
        bind_nsml(model, tokenizer, args)

        ## to load pretrained model
        #nsml.load(checkpoint='best', session='t0005/rush1-1/177')

        #print("error? 3")

    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train" or args.mode == "pretrain":
        train(model, tokenizer, train_data, valid_data, args)
Example #16
    def __init__(
        self,
        model_type,
        model_name,
        generator_name=None,
        discriminator_name=None,
        train_files=None,
        args=None,
        use_cuda=True,
        cuda_device=-1,
        **kwargs,
    ):

        """
        Initializes a LanguageModelingModel.

        Args:
            model_type: The type of model (gpt2, openai-gpt, bert, roberta, distilbert, camembert)
            model_name: Default Transformer model name or path to a directory containing a Transformer model file (pytorch_model.bin).
            generator_name (optional): A pretrained model name or path to a directory containing an ELECTRA generator model.
            discriminator_name (optional): A pretrained model name or path to a directory containing an ELECTRA discriminator model.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            train_files (optional): List of files to be used when training the tokenizer.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
            cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
            **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
        """  # noqa: ignore flake8"

        self.args = self._load_model_args(model_name)

        if isinstance(args, dict):
            self.args.update_from_dict(args)
        elif isinstance(args, LanguageModelingArgs):
            self.args = args

        if "sweep_config" in kwargs:
            sweep_config = kwargs.pop("sweep_config")
            sweep_values = {key: value["value"] for key, value in sweep_config.as_dict().items() if key != "_wandb"}
            self.args.update_from_dict(sweep_values)

        if self.args.manual_seed:
            random.seed(self.args.manual_seed)
            np.random.seed(self.args.manual_seed)
            torch.manual_seed(self.args.manual_seed)
            if self.args.n_gpu > 0:
                torch.cuda.manual_seed_all(self.args.manual_seed)

        if self.args.local_rank != -1:
            logger.info(f"local_rank: {self.args.local_rank}")
            torch.distributed.init_process_group(backend="nccl")
            cuda_device = self.args.local_rank

        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError(
                    "'use_cuda' set to True when cuda is unavailable."
                    " Make sure CUDA is available or set use_cuda=False."
                )
        else:
            self.device = "cpu"

        self.results = {}

        if not use_cuda:
            self.args.fp16 = False

        self.args.model_name = model_name
        self.args.model_type = model_type

        config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
        self.tokenizer_class = tokenizer_class
        new_tokenizer = False

        if self.args.tokenizer_name:
            self.tokenizer = tokenizer_class.from_pretrained(self.args.tokenizer_name, cache_dir=self.args.cache_dir)
        elif self.args.model_name:
            if self.args.model_name == "electra":
                self.tokenizer = tokenizer_class.from_pretrained(
                    generator_name, cache_dir=self.args.cache_dir, **kwargs
                )
                self.args.tokenizer_name = self.args.model_name
            else:
                self.tokenizer = tokenizer_class.from_pretrained(model_name, cache_dir=self.args.cache_dir, **kwargs)
                self.args.tokenizer_name = self.args.model_name
        else:
            if not train_files:
                raise ValueError(
                    "model_name and tokenizer_name are not specified."
                    "You must specify train_files to train a Tokenizer."
                )
            else:
                self.train_tokenizer(train_files)
                new_tokenizer = True

        if self.args.config_name:
            self.config = config_class.from_pretrained(self.args.config_name, cache_dir=self.args.cache_dir)
        elif self.args.model_name and self.args.model_name != "electra":
            self.config = config_class.from_pretrained(model_name, cache_dir=self.args.cache_dir, **kwargs)
        else:
            self.config = config_class(**self.args.config, **kwargs)
        if self.args.vocab_size:
            self.config.vocab_size = self.args.vocab_size
        if new_tokenizer:
            self.config.vocab_size = len(self.tokenizer)

        if self.args.model_type == "electra":
            if generator_name:
                self.generator_config = ElectraConfig.from_pretrained(generator_name)
            elif self.args.model_name:
                self.generator_config = ElectraConfig.from_pretrained(
                    os.path.join(self.args.model_name, "generator_config"), **kwargs,
                )
            else:
                self.generator_config = ElectraConfig(**self.args.generator_config, **kwargs)
                if new_tokenizer:
                    self.generator_config.vocab_size = len(self.tokenizer)

            if discriminator_name:
                self.discriminator_config = ElectraConfig.from_pretrained(discriminator_name)
            elif self.args.model_name:
                self.discriminator_config = ElectraConfig.from_pretrained(
                    os.path.join(self.args.model_name, "discriminator_config"), **kwargs,
                )
            else:
                self.discriminator_config = ElectraConfig(**self.args.discriminator_config, **kwargs)
                if new_tokenizer:
                    self.discriminator_config.vocab_size = len(self.tokenizer)

        if self.args.block_size <= 0:
            self.args.block_size = min(self.args.max_seq_length, self.tokenizer.max_len)
        else:
            self.args.block_size = min(self.args.block_size, self.tokenizer.max_len, self.args.max_seq_length)

        if self.args.model_name:
            if self.args.model_type == "electra":
                if self.args.model_name == "electra":
                    generator_model = ElectraForMaskedLM.from_pretrained(generator_name)
                    discriminator_model = ElectraForPreTraining.from_pretrained(discriminator_name)
                    self.model = ElectraForLanguageModelingModel(
                        config=self.config,
                        generator_model=generator_model,
                        discriminator_model=discriminator_model,
                        generator_config=self.generator_config,
                        discriminator_config=self.discriminator_config,
                        tie_generator_and_discriminator_embeddings=self.args.tie_generator_and_discriminator_embeddings,
                    )
                    model_to_resize = (
                        self.model.generator_model.module
                        if hasattr(self.model.generator_model, "module")
                        else self.model.generator_model
                    )
                    model_to_resize.resize_token_embeddings(len(self.tokenizer))

                    model_to_resize = (
                        self.model.discriminator_model.module
                        if hasattr(self.model.discriminator_model, "module")
                        else self.model.discriminator_model
                    )
                    model_to_resize.resize_token_embeddings(len(self.tokenizer))
                    self.model.generator_model = generator_model
                    self.model.discriminator_model = discriminator_model
                else:
                    self.model = model_class.from_pretrained(
                        model_name,
                        config=self.config,
                        cache_dir=self.args.cache_dir,
                        generator_config=self.generator_config,
                        discriminator_config=self.discriminator_config,
                        **kwargs,
                    )
                    self.model.load_state_dict(torch.load(os.path.join(self.args.model_name, "pytorch_model.bin")))
            else:
                self.model = model_class.from_pretrained(
                    model_name, config=self.config, cache_dir=self.args.cache_dir, **kwargs,
                )
        else:
            logger.info(" Training language model from scratch")
            if self.args.model_type == "electra":
                generator_model = ElectraForMaskedLM(config=self.generator_config)
                discriminator_model = ElectraForPreTraining(config=self.discriminator_config)
                self.model = ElectraForLanguageModelingModel(
                    config=self.config,
                    generator_model=generator_model,
                    discriminator_model=discriminator_model,
                    generator_config=self.generator_config,
                    discriminator_config=self.discriminator_config,
                    tie_generator_and_discriminator_embeddings=self.args.tie_generator_and_discriminator_embeddings,
                )
                model_to_resize = (
                    self.model.generator_model.module
                    if hasattr(self.model.generator_model, "module")
                    else self.model.generator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))

                model_to_resize = (
                    self.model.discriminator_model.module
                    if hasattr(self.model.discriminator_model, "module")
                    else self.model.discriminator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))
            else:
                self.model = model_class(config=self.config)
                model_to_resize = self.model.module if hasattr(self.model, "module") else self.model
                model_to_resize.resize_token_embeddings(len(self.tokenizer))

        if model_type in ["camembert", "xlmroberta"]:
            warnings.warn(
                f"use_multiprocessing automatically disabled as {model_type}"
                " fails when using multiprocessing for feature conversion."
            )
            self.args.use_multiprocessing = False

        if self.args.wandb_project and not wandb_available:
            warnings.warn("wandb_project specified but wandb is not available. Wandb disabled.")
            self.args.wandb_project = None
Example #17
print(tokenized_text)

# Convert token to vocabulary indices
token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
token_type_ids = [0] * len(token_ids)

print(token_ids)
print(token_type_ids) # segment_ids

# Convert inputs to PyTorch tensors
token_ids_tensor = torch.tensor([token_ids]).to('cuda')
token_type_ids_tensor = torch.tensor([token_type_ids]).to('cuda')

## 2
# Load pre-trained model (weights)
model = ElectraForMaskedLM.from_pretrained('monologg/koelectra-base-discriminator')

# Set the model in evaluation mode to deactivate the DropOut modules
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()
model.to('cuda')

## 3
# Predict all tokens
with torch.no_grad():
    outputs = model(token_ids_tensor, token_type_ids=token_type_ids_tensor)
    predictions = outputs[0]
print(predictions)

predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
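
An optional extension to the snippet above (not in the original): listing several top-scoring candidates for the masked position instead of only the argmax.

# Optional extension, not in the original snippet: show the top-5 candidates.
top_k_ids = torch.topk(predictions[0, masked_index], k=5).indices.tolist()
print(tokenizer.convert_ids_to_tokens(top_k_ids))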
Example #18
from transformers import ElectraForMaskedLM, ElectraTokenizer

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Unsupervised training')
    parser.add_argument("--model_path",
                        type=str,
                        default="",
                        help="Pretrained Electra Model Path")
    parser.add_argument("--emb_out_path",
                        type=str,
                        default="",
                        help="Electra Embedding Output Path")
    args = parser.parse_args()

    electra_model = ElectraForMaskedLM.from_pretrained(args.model_path)
    tokenizer = ElectraTokenizer.from_pretrained(args.model_path)

    vocab_size, emb_dim = electra_model.electra.embeddings.word_embeddings.weight.size(
    )

    assert vocab_size == tokenizer.vocab_size

    vectors = torch.zeros((vocab_size, emb_dim), dtype=torch.float32)

    dico = []
    for idx in range(vocab_size):
        token = tokenizer.ids_to_tokens.get(idx)
        assert token is not None
        dico.append(token)
        vectors[idx] = electra_model.electra.embeddings.word_embeddings.weight[