def __init__(self, config, **kwargs):
    super(ElectraForLanguageModelingModel, self).__init__(config, **kwargs)
    if "generator_config" in kwargs:
        generator_config = kwargs["generator_config"]
    else:
        generator_config = config
    self.generator_model = ElectraForMaskedLM(generator_config)
    if "discriminator_config" in kwargs:
        discriminator_config = kwargs["discriminator_config"]
    else:
        discriminator_config = config
    self.discriminator_model = ElectraForPreTraining(discriminator_config)
    self.vocab_size = config.vocab_size
    if kwargs.get("tie_generator_and_discriminator_embeddings", True):
        self.tie_generator_and_discriminator_embeddings()
def main(train_cfg='config/electra_pretrain.json',
         model_cfg='config/electra_small.json',
         data_file='../tbc/books_large_all.txt',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         save_dir='../exp/electra/pretrain',
         log_dir='../exp/electra/pretrain/runs',
         max_len=128,
         max_pred=20,
         mask_prob=0.15):
    check_dirs_exist([log_dir, save_dir])

    train_cfg = ElectraConfig().from_json_file(train_cfg)
    model_cfg = ElectraConfig().from_json_file(model_cfg)

    set_seeds(train_cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
    tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))

    pipeline = [
        Preprocess4Pretrain(max_pred, mask_prob, list(tokenizer.vocab.keys()),
                            tokenizer.convert_tokens_to_ids, max_len)
    ]
    data_iter = SentPairDataLoader(data_file, train_cfg.batch_size, tokenize,
                                   max_len, pipeline=pipeline)

    # Get distilled-electra and quantized-distilled-electra
    generator = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')
    discriminator = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
    model = Electra(generator, discriminator)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

    base_trainer_args = (train_cfg, model_cfg, model, data_iter, optimizer,
                         save_dir, get_device())
    trainer = ElectraTrainer(writer, *base_trainer_args)
    trainer.train(model_file, None, data_parallel)
def __init__(self, model_path='bert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda'):
    super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
    self.model_path = model_path
    # self.tokenizer = AutoTokenizer.from_pretrained(model_path)
    # self.model = AutoModel.from_pretrained(model_path)
    self.tokenizer = ElectraTokenizer.from_pretrained(model_path)
    self.model = ElectraForMaskedLM.from_pretrained(model_path)
    self.model.to(self.device)
    self.model.eval()
def __init__(self, config, output_size=100, extra_args=None, **kwargs):
    super(ElectraForLanguageModelingModel, self).__init__(config, **kwargs)
    self.extra_args = extra_args
    if "generator_config" in kwargs:
        generator_config = kwargs["generator_config"]
    else:
        generator_config = config
    self.generator_model = ElectraForMaskedLM(generator_config)
    if "discriminator_config" in kwargs:
        discriminator_config = kwargs["discriminator_config"]
    else:
        discriminator_config = config
    self.discriminator_model = ElectraForPreTraining(discriminator_config,
                                                     output_size=output_size,
                                                     extra_args=self.extra_args)
    self.vocab_size = generator_config.vocab_size
    if kwargs.get("tie_generator_and_discriminator_embeddings", True):
        self.tie_generator_and_discriminator_embeddings()
    if "random_generator" in kwargs:
        self.random_generator = kwargs['random_generator']
        print(f'IN MODEL: RANDOM GENERATOR: {self.random_generator}')
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator):
    # Initialise PyTorch model
    config = ElectraConfig.from_json_file(config_file)
    print(f"Building PyTorch model from configuration: {config}")

    if discriminator_or_generator == "discriminator":
        model = ElectraForPreTraining(config)
    elif discriminator_or_generator == "generator":
        model = ElectraForMaskedLM(config)
    else:
        raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'")

    # Load weights from tf checkpoint
    load_tf_weights_in_electra(
        model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator
    )

    # Save pytorch-model
    print(f"Save PyTorch model to {pytorch_dump_path}")
    torch.save(model.state_dict(), pytorch_dump_path)
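# Minimal usage sketch of the converter above. The checkpoint, config and output paths are
# placeholders (assumed), not values from the original script.
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="electra_small/model.ckpt",        # hypothetical TF checkpoint
    config_file="electra_small/config.json",              # hypothetical ELECTRA config
    pytorch_dump_path="electra_small/pytorch_model.bin",  # where the PyTorch weights are written
    discriminator_or_generator="generator",
)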
def create_and_check_electra_for_masked_lm(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
    fake_token_labels,
):
    model = ElectraForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
def train(rank, args):
    #######################
    ## distributed

    if args.distributed_enabled:
        torch.distributed.init_process_group(
            backend='nccl',
            init_method='env://',
            world_size=args.distributed_world_size,
            rank=rank)
    if args.gpu_enabled:
        device = torch.device('cuda:{}'.format(rank))
    else:
        device = torch.device('cpu')

    is_master = True if not args.distributed_enabled else args.distributed_enabled and rank == 0

    #######################
    ## preamble

    set_gpus(rank)
    set_seed(rank)
    set_cuda(deterministic=args.gpu_deterministic)

    output_dir = f'{args.output_dir}/{rank}'
    os.makedirs(output_dir, exist_ok=False)

    setup_logging(filename=f'{output_dir}/output.log', console=is_master)

    #######################
    ## dataset

    tokenizer = new_tokenizer(vocab_file=args.data_vocab_file)
    vocab_size = len(tokenizer.vocab)
    ds_train = wrap_example_builder(
        dataset=load_owt(owt_dir=args.data_dir, n_tensors_per_file=args.data_n_tensors_per_file),
        vocab=tokenizer.vocab,
        max_length=args.data_max_seq_length)

    pad_token_id = tokenizer.vocab['[PAD]']
    mask_token_id = tokenizer.vocab['[MASK]']
    cls_token_id = tokenizer.vocab['[CLS]']
    sep_token_id = tokenizer.vocab['[SEP]']

    def collate_batch(examples):
        input_ids = torch.nn.utils.rnn.pad_sequence(
            [example['input_ids'] for example in examples],
            batch_first=True,
            padding_value=pad_token_id)
        input_mask = torch.nn.utils.rnn.pad_sequence(
            [example['input_mask'] for example in examples],
            batch_first=True,
            padding_value=pad_token_id)
        segment_ids = torch.nn.utils.rnn.pad_sequence(
            [example['segment_ids'] for example in examples],
            batch_first=True,
            padding_value=pad_token_id)
        return input_ids, input_mask, segment_ids

    def cycle(iterable):
        while True:
            for x in iterable:
                yield x

    ds_train_loader = iter(
        cycle(DataLoader(ds_train, batch_size=args.opt_batch_size, collate_fn=collate_batch)))

    #######################
    ## model

    def to_distributed_model(model):
        return model if not args.distributed_enabled else torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[rank], find_unused_parameters=True)

    def tie_weights(generator, discriminator):
        generator.electra.embeddings.word_embeddings = discriminator.electra.embeddings.word_embeddings
        generator.electra.embeddings.position_embeddings = discriminator.electra.embeddings.position_embeddings
        generator.electra.embeddings.token_type_embeddings = discriminator.electra.embeddings.token_type_embeddings

    class LogitsAdapter(torch.nn.Module):
        def __init__(self, adaptee):
            super().__init__()
            self.adaptee = adaptee

        def forward(self, *args, **kwargs):
            return self.adaptee(*args, **kwargs)[0]

    from transformers import AutoConfig, ElectraForMaskedLM, ElectraForPreTraining

    generator = ElectraForMaskedLM(AutoConfig.from_pretrained(args.model_generator))
    discriminator = AdaptedDiscriminator(AlbertConfig.from_pretrained(args.model_discriminator))
    tie_weights(generator, discriminator)

    model = to_distributed_model(
        Electra(LogitsAdapter(generator),
                LogitsAdapter(discriminator),
                num_tokens=vocab_size,
                mask_token_id=mask_token_id,
                pad_token_id=pad_token_id,
                mask_prob=args.model_mask_prob,
                mask_ignore_token_ids=[tokenizer.vocab['[CLS]'], tokenizer.vocab['[SEP]']],
                random_token_prob=0.0).to(device))

    #######################
    ## optimizer

    def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
        def lr_lambda(current_step):
            learning_rate = max(0.0, 1. - (float(current_step) / float(num_training_steps)))
            learning_rate *= min(1.0, float(current_step) / float(num_warmup_steps))
            return learning_rate
        return LambdaLR(optimizer, lr_lambda, last_epoch)

    def get_params_without_weight_decay_ln(named_params, weight_decay):
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)],
                'weight_decay': weight_decay,
            },
            {
                'params': [p for n, p in named_params if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
            },
        ]
        return optimizer_grouped_parameters

    optimizer = torch.optim.AdamW(
        get_params_without_weight_decay_ln(model.named_parameters(), weight_decay=0.1),
        lr=args.opt_lr,
        betas=(0.9, 0.999),
        eps=1e-08)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.opt_warmup_steps,
        num_training_steps=args.opt_num_training_steps)
    scaler = torch.cuda.amp.GradScaler(enabled=args.gpu_mixed_precision)

    #######################
    ## train

    t, steps_s, eta_m = time(), 0., 0
    for step in range(args.opt_num_training_steps + 1):
        input_ids, input_mask, segment_ids = next(ds_train_loader)

        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)

        assert input_ids.shape[1] <= args.data_max_seq_length

        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=args.gpu_mixed_precision):
            loss, loss_mlm, loss_disc, acc_gen, acc_disc, disc_labels, disc_pred = model(
                input_ids, attention_mask=input_mask, token_type_ids=segment_ids)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        metrics = {
            'step': (step, '{:8d}'),
            'loss': (loss.item(), '{:8.5f}'),
            'loss_mlm': (loss_mlm.item(), '{:8.5f}'),
            'loss_disc': (loss_disc.item(), '{:8.5f}'),
            'acc_gen': (acc_gen.item(), '{:5.3f}'),
            'acc_disc': (acc_disc.item(), '{:5.3f}'),
            'lr': (scheduler.get_last_lr()[0], '{:8.7f}'),
            'steps': (steps_s, '{:4.1f}/s'),
            'eta': (eta_m, '{:4d}m'),
        }

        if step % args.step_log == 0:
            sep = ' ' * 2
            logger.info(sep.join([f'{k}: {v[1].format(v[0])}' for (k, v) in metrics.items()]))

        if step > 0 and step % 100 == 0:
            t2 = time()
            steps_s = 100. / (t2 - t)
            eta_m = int(((args.opt_num_training_steps - step) / steps_s) // 60)
            t = t2

        if step % 200 == 0:
            logger.info(np.array2string(disc_labels[0].cpu().numpy(),
                                        threshold=sys.maxsize, max_line_width=sys.maxsize))
            logger.info(np.array2string(disc_pred[0].cpu().numpy(),
                                        threshold=sys.maxsize, max_line_width=sys.maxsize))

        if step > 0 and step % args.step_ckpt == 0 and is_master:
            discriminator.electra.save_pretrained(f'{args.output_dir}/ckpt/{step}')
def __init__(
    self,
    model_type,
    model_name,
    generator_name=None,
    discriminator_name=None,
    train_files=None,
    args=None,
    use_cuda=True,
    cuda_device=-1,
    **kwargs,
):
    """
    Initializes a LanguageModelingModel.

    Args:
        model_type: The type of model (gpt2, openai-gpt, bert, roberta, distilbert, camembert)
        model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_model.bin).
        generator_name (optional): A pretrained model name or path to a directory containing an ELECTRA generator model.
        discriminator_name (optional): A pretrained model name or path to a directory containing an ELECTRA discriminator model.
        args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
        train_files (optional): List of files to be used when training the tokenizer.
        use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
        cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
        **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
    """  # noqa: ignore flake8"

    if args and "manual_seed" in args:
        random.seed(args["manual_seed"])
        np.random.seed(args["manual_seed"])
        torch.manual_seed(args["manual_seed"])
        if "n_gpu" in args and args["n_gpu"] > 0:
            torch.cuda.manual_seed_all(args["manual_seed"])

    if use_cuda:
        if torch.cuda.is_available():
            if cuda_device == -1:
                self.device = torch.device("cuda")
            else:
                self.device = torch.device(f"cuda:{cuda_device}")
        else:
            raise ValueError(
                "'use_cuda' set to True when cuda is unavailable."
                " Make sure CUDA is available or set use_cuda=False."
            )
    else:
        self.device = "cpu"

    self.results = {}

    self.args = {
        "dataset_type": "None",
        "dataset_class": None,
        "custom_tokenizer": None,
        "block_size": -1,
        "mlm": True,
        "mlm_probability": 0.15,
        "max_steps": -1,
        "config_name": None,
        "tokenizer_name": None,
        "min_frequency": 2,
        "special_tokens": ["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
        "sliding_window": False,
        "stride": 0.8,
        "generator_config": {},
        "discriminator_config": {},
        "vocab_size": None,
    }

    self.args.update(global_args)

    if not use_cuda:
        self.args["fp16"] = False

    if args:
        self.args.update(args)

    self.args["model_name"] = model_name
    self.args["model_type"] = model_type

    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
    self.tokenizer_class = tokenizer_class
    new_tokenizer = False

    if self.args["tokenizer_name"]:
        self.tokenizer = tokenizer_class.from_pretrained(
            self.args["tokenizer_name"], cache_dir=self.args["cache_dir"]
        )
    elif self.args["model_name"]:
        self.tokenizer = tokenizer_class.from_pretrained(model_name, cache_dir=self.args["cache_dir"], **kwargs)
        self.args["tokenizer_name"] = self.args["model_name"]
    else:
        if not train_files:
            raise ValueError(
                "model_name and tokenizer_name are not specified."
                "You must specify train_files to train a Tokenizer."
            )
        else:
            self.train_tokenizer(train_files)
            new_tokenizer = True

    if self.args["config_name"]:
        self.config = config_class.from_pretrained(self.args["config_name"], cache_dir=self.args["cache_dir"])
    elif self.args["model_name"]:
        self.config = config_class.from_pretrained(model_name, cache_dir=self.args["cache_dir"], **kwargs)
    else:
        self.config = config_class(**self.args["config"], **kwargs)
    if self.args["vocab_size"]:
        self.config.vocab_size = self.args["vocab_size"]
    if new_tokenizer:
        self.config.vocab_size = len(self.tokenizer)

    if self.args["model_type"] == "electra":
        if generator_name:
            self.generator_config = ElectraConfig.from_pretrained(generator_name)
        elif self.args["model_name"]:
            self.generator_config = ElectraConfig.from_pretrained(
                os.path.join(self.args["model_name"], "generator_config"), **kwargs,
            )
        else:
            self.generator_config = ElectraConfig(**self.args["generator_config"], **kwargs)
            if new_tokenizer:
                self.generator_config.vocab_size = len(self.tokenizer)

        if discriminator_name:
            self.discriminator_config = ElectraConfig.from_pretrained(discriminator_name)
        elif self.args["model_name"]:
            self.discriminator_config = ElectraConfig.from_pretrained(
                os.path.join(self.args["model_name"], "discriminator_config"), **kwargs,
            )
        else:
            self.discriminator_config = ElectraConfig(**self.args["discriminator_config"], **kwargs)
            if new_tokenizer:
                self.discriminator_config.vocab_size = len(self.tokenizer)

    if self.args["block_size"] <= 0:
        self.args["block_size"] = min(self.args["max_seq_length"], self.tokenizer.max_len)
    else:
        self.args["block_size"] = min(self.args["block_size"], self.tokenizer.max_len, self.args["max_seq_length"])

    if self.args["model_name"]:
        if self.args["model_type"] == "electra":
            self.model = model_class.from_pretrained(
                model_name,
                config=self.config,
                cache_dir=self.args["cache_dir"],
                generator_config=self.generator_config,
                discriminator_config=self.discriminator_config,
                **kwargs,
            )
            self.model.load_state_dict(torch.load(os.path.join(self.args["model_name"], "pytorch_model.bin")))
        else:
            self.model = model_class.from_pretrained(
                model_name,
                config=self.config,
                cache_dir=self.args["cache_dir"],
                **kwargs,
            )
    else:
        logger.info(" Training language model from scratch")
        if self.args["model_type"] == "electra":
            generator_model = ElectraForMaskedLM(config=self.generator_config)
            discriminator_model = ElectraForPreTraining(config=self.discriminator_config)
            self.model = ElectraForLanguageModelingModel(
                config=self.config,
                generator_model=generator_model,
                discriminator_model=discriminator_model,
                generator_config=self.generator_config,
                discriminator_config=self.discriminator_config,
            )
            model_to_resize = (
                self.model.generator_model.module
                if hasattr(self.model.generator_model, "module")
                else self.model.generator_model
            )
            model_to_resize.resize_token_embeddings(len(self.tokenizer))

            model_to_resize = (
                self.model.discriminator_model.module
                if hasattr(self.model.discriminator_model, "module")
                else self.model.discriminator_model
            )
            model_to_resize.resize_token_embeddings(len(self.tokenizer))
        else:
            self.model = model_class(config=self.config)
            model_to_resize = self.model.module if hasattr(self.model, "module") else self.model
            model_to_resize.resize_token_embeddings(len(self.tokenizer))

    if model_type in ["camembert", "xlmroberta"]:
        warnings.warn(
            f"use_multiprocessing automatically disabled as {model_type}"
            " fails when using multiprocessing for feature conversion."
        )
        self.args["use_multiprocessing"] = False

    if self.args["wandb_project"] and not wandb_available:
        warnings.warn("wandb_project specified but wandb is not available. Wandb disabled.")
        self.args["wandb_project"] = None
class ElectraForLanguageModelingModel(PreTrainedModel):
    def __init__(self, config, **kwargs):
        super(ElectraForLanguageModelingModel, self).__init__(config, **kwargs)
        if "generator_config" in kwargs:
            generator_config = kwargs["generator_config"]
        else:
            generator_config = config
        self.generator_model = ElectraForMaskedLM(generator_config)
        if "discriminator_config" in kwargs:
            discriminator_config = kwargs["discriminator_config"]
        else:
            discriminator_config = config
        self.discriminator_model = ElectraForPreTraining(discriminator_config)
        self.vocab_size = generator_config.vocab_size
        if kwargs.get("tie_generator_and_discriminator_embeddings", True):
            self.tie_generator_and_discriminator_embeddings()

    def tie_generator_and_discriminator_embeddings(self):
        self.discriminator_model.set_input_embeddings(self.generator_model.get_input_embeddings())

    def forward(self, inputs, masked_lm_labels, attention_mask=None, token_type_ids=None):
        d_inputs = inputs.clone()

        # run masked LM.
        g_out = self.generator_model(inputs,
                                     masked_lm_labels=masked_lm_labels,
                                     attention_mask=attention_mask,
                                     token_type_ids=token_type_ids)

        # get samples from masked LM.
        sample_probs = torch.softmax(g_out[1], dim=-1, dtype=torch.float32)
        sample_probs = sample_probs.view(-1, self.vocab_size)
        sampled_tokens = torch.multinomial(sample_probs, 1).view(-1)
        sampled_tokens = sampled_tokens.view(d_inputs.shape[0], -1)

        # labels have a -100 value to mask out loss from unchanged tokens.
        mask = masked_lm_labels.ne(-100)

        # replace the masked out tokens of the input with the generator predictions.
        d_inputs[mask] = sampled_tokens[mask]

        # turn mask into new target labels. 1 (True) for corrupted, 0 otherwise.
        # if the prediction was correct, mark it as uncorrupted.
        correct_preds = sampled_tokens == masked_lm_labels
        d_labels = mask.long()
        d_labels[correct_preds] = 0

        # run token classification, predict whether each token was corrupted.
        d_out = self.discriminator_model(d_inputs,
                                         labels=d_labels,
                                         attention_mask=attention_mask,
                                         token_type_ids=token_type_ids)

        g_loss = g_out[0]
        d_loss = d_out[0]
        g_scores = g_out[1]
        d_scores = d_out[1]
        return g_loss, d_loss, g_scores, d_scores, d_labels
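# Minimal, hypothetical sketch of driving the class above: tiny generator/discriminator configs,
# one hand-masked token, and a combined loss. The config sizes, tokenizer name, and the 50x
# discriminator-loss weight (lambda from the ELECTRA paper) are assumptions, not part of this class;
# the forward call also assumes a transformers version old enough that ElectraForMaskedLM still
# accepts the `masked_lm_labels` keyword, as the class itself does.
import torch
from transformers import ElectraConfig, ElectraTokenizer

tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-generator")
gen_config = ElectraConfig(vocab_size=tokenizer.vocab_size, hidden_size=64,
                           num_hidden_layers=2, num_attention_heads=2, intermediate_size=128)
disc_config = ElectraConfig(vocab_size=tokenizer.vocab_size, hidden_size=128,
                            num_hidden_layers=4, num_attention_heads=4, intermediate_size=256)
model = ElectraForLanguageModelingModel(disc_config,
                                        generator_config=gen_config,
                                        discriminator_config=disc_config)

enc = tokenizer("the quick brown fox jumps over the lazy dog", return_tensors="pt")
inputs, labels = enc["input_ids"].clone(), enc["input_ids"].clone()
mask = torch.zeros_like(inputs, dtype=torch.bool)
mask[0, 4] = True                       # corrupt a single position for the example
labels[~mask] = -100                    # only masked positions contribute to the MLM loss
inputs[mask] = tokenizer.mask_token_id

g_loss, d_loss, g_scores, d_scores, d_labels = model(
    inputs, masked_lm_labels=labels, attention_mask=enc["attention_mask"])
loss = g_loss + 50.0 * d_loss           # lambda = 50 weighting as in the ELECTRA paper (assumption)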
# eval_dataset = GenNLPMaskedDataset(train_batch_paths[-1:], tokenizer, seed=seed, masked_by_flag=True, only_input=True)

## test data
test_batch_paths = test_region_paths[i]
test_dataset = GenNLPMaskedDataset(test_batch_paths, tokenizer, seed=seed,
                                   masked_by_flag=True, only_input=True)

## model
modeling_args.vocab_size = tokenizer.vocab_size
if mode == 'pretrain':
    modeling_args.max_position_embeddings = 1300
else:
    modeling_args.max_position_embeddings = train_dataset.max_position_embeddings()
electra_model = ElectraForMaskedLM(modeling_args)
if os.path.isdir(prevert_path):
    electra_model = ElectraForMaskedLM.from_pretrained(prevert_path)

trainer = OTrainer(
    model=electra_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=r2_score_transformers,
)
trainer.train()
trainer.save_model(save_path)

output_test = trainer.predict(test_dataset)
metrics = output_test.metrics
test_result_path = os.path.join(save_path, 'test_result.json')
with g.writing(test_result_path) as trf:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()

xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
xlnet_model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased').eval()

xlmroberta_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained('xlm-roberta-base').eval()

bart_tokenizer = BartTokenizer.from_pretrained('bart-large')
bart_model = BartForConditionalGeneration.from_pretrained('bart-large').eval()

electra_tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
electra_model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator').eval()

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base').eval()

top_k = 10


def decode(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD]'
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if token not in ignore_tokens:
            tokens.append(token.replace('##', ''))
    return '\n'.join(tokens[:top_clean])
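# Minimal fill-mask sketch reusing the ELECTRA generator and decode() defined above; the example
# sentence and the use of torch.topk here are illustrative assumptions, not part of the original.
import torch

text = f"The capital of France is {electra_tokenizer.mask_token}."
input_ids = electra_tokenizer.encode(text, return_tensors="pt")
mask_pos = (input_ids[0] == electra_tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
with torch.no_grad():
    logits = electra_model(input_ids)[0]
pred_idx = torch.topk(logits[0, mask_pos[0]], k=top_k).indices   # top-k candidate token ids
print(decode(electra_tokenizer, pred_idx, top_clean=5))          # keep the 5 cleanest candidates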
def load_ELECTRAsmall(ELECTRA_PATH):
    ELECTRAmodel = ElectraForMaskedLM.from_pretrained(ELECTRA_PATH)
    ELECTRAtokenizer = ElectraTokenizer.from_pretrained(ELECTRA_PATH)
    return ELECTRAmodel, ELECTRAtokenizer
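# Hypothetical usage sketch of the loader above: fill one masked token with the small ELECTRA
# generator. "google/electra-small-generator" stands in for ELECTRA_PATH, and the .logits access
# assumes a recent transformers version (return_dict on by default).
import torch

model, tokenizer = load_ELECTRAsmall("google/electra-small-generator")
model.eval()
inputs = tokenizer(f"Paris is the {tokenizer.mask_token} of France.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
mask_pos = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
print(tokenizer.decode(logits[0, mask_pos].argmax(dim=-1)))  # most likely replacement token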
    gen_pred = gen_logits.argmax(dim=-1)
    disc_pred = disc_logits > 0
    return gen_pred, generated, disc_pred, is_replaced

# %% [markdown]
# # 5. Train

# %%
# Generator and Discriminator
if c.my_model:
    gen_hparam['tie_in_out_embedding'] = c.tie_gen_in_out_embedding
    generator = ModelForGenerator(gen_hparam)
    discriminator = ModelForDiscriminator(disc_hparam)
    discriminator.electra.embedding = generator.electra.embedding
else:
    generator = ElectraForMaskedLM(gen_config)
    discriminator = ElectraForPreTraining(disc_config)
    discriminator.electra.embeddings = generator.electra.embeddings
    if c.tie_gen_in_out_embedding:
        generator.generator_predictions.dense.weight = generator.electra.embeddings.word_embeddings.weight

# ELECTRA training loop
electra_model = ELECTRAModel(generator, discriminator, hf_tokenizer)
electra_loss_func = ELECTRALoss(gen_label_smooth=c.gen_smooth_label, disc_label_smooth=c.disc_smooth_label)

# jit (Haven't figured out how to make it work)
# input_ids, sentA_lenths = dls.one_batch()
# masked_inputs, labels, is_mlm_applied = mlm_cb.mask_tokens(input_ids)
# electra_jit_model = torch.jit.trace(electra_model, (masked_inputs, sentA_lenths, is_mlm_applied, labels))

# Optimizer
    skip_first=True)

# datasets
model_name = config.bert_model_name
tokenizer = RobertaTokenizer.from_pretrained(config.bert_model_name)

cur_swap_prob = 0.
max_swap_prob = 0.
if max_swap_prob == 0:
    wordswap_tokenizer = wordswap_model = None
else:
    wordswap_tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
    wordswap_model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator',
                                                        return_dict=True).cuda()

if config.get("split_by_doc_lens"):
    sent_lens = config.get("sent_lens")

train_set = IEDataset(config.file_dir + config.train_file, config, word_vocab,
                      wordswap_tokenizer, wordswap_model)
dev_set = IEDataset(config.file_dir + config.dev_file, config, word_vocab,
                    wordswap_tokenizer, wordswap_model)

if config.get("split_by_doc_lens"):
    test_sets = []
    for i in range(1, len(sent_lens)):
        max_len = sent_lens[i]
        min_len = sent_lens[i - 1]
        test_sets.append(
            IEDataset(config.file_dir + config.test_file,
def main():
    args = get_args()
    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()
    set_seed(args)

    model = ElectraForMaskedLM.from_pretrained("monologg/koelectra-base-v2-discriminator").to(args.device)
    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v2-discriminator")
    print("get tokenizer, model success")

    '''
    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    '''

    logger.info(f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M")

    ##tokenizer = CharTokenizer([])
    bind_nsml(model, tokenizer, args)
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode == "train" or args.mode == "pretrain":
        if args.mode == "train":
            noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
            sents_annotation = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation"))
            clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))
        if args.mode == "pretrain":
            print("PRETRAIN MODE ON!!")
            checkpoint = 'generated_data'
            sess = 't0005/rush1-1/113'
            noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
            sents_annotation = ['None'] * len(noisy_sents)

        error_type_counter = Counter()
        for annotation in sents_annotation:
            error_type_counter += Counter(annotation.split(','))
        print(error_type_counter)

        # noise-cleaning version
        # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]
        # original version
        pairs = [{
            "noisy": noisy,
            "clean": clean,
            "annotation": annot
        } for noisy, clean, annot in zip(noisy_sents, clean_sents, sents_annotation)]

        #print("error? 1")
        train_data, valid_data = pairs[:-args.num_val_data], pairs[-args.num_val_data:]
        logger.info(f"# of train data: {len(train_data)}")
        logger.info(f"# of valid data: {len(valid_data)}")

        #print("error? 2")
        #train_sents = [x['noisy'] for x in train_data] + [x['clean'] for x in train_data]
        #tokenizer = CharTokenizer.from_strings(train_sents, args.vocab_size)
        bind_nsml(model, tokenizer, args)
        ## to load pretrained model
        #nsml.load(checkpoint='best', session='t0005/rush1-1/177')

    #print("error? 3")
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, dim=1)

    if args.mode == "train" or args.mode == "pretrain":
        train(model, tokenizer, train_data, valid_data, args)
def __init__(
    self,
    model_type,
    model_name,
    generator_name=None,
    discriminator_name=None,
    train_files=None,
    args=None,
    use_cuda=True,
    cuda_device=-1,
    **kwargs,
):
    """
    Initializes a LanguageModelingModel.

    Args:
        model_type: The type of model (gpt2, openai-gpt, bert, roberta, distilbert, camembert)
        model_name: Default Transformer model name or path to a directory containing Transformer model file (pytorch_model.bin).
        generator_name (optional): A pretrained model name or path to a directory containing an ELECTRA generator model.
        discriminator_name (optional): A pretrained model name or path to a directory containing an ELECTRA discriminator model.
        args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
        train_files (optional): List of files to be used when training the tokenizer.
        use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
        cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
        **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
    """  # noqa: ignore flake8"

    self.args = self._load_model_args(model_name)

    if isinstance(args, dict):
        self.args.update_from_dict(args)
    elif isinstance(args, LanguageModelingArgs):
        self.args = args

    if "sweep_config" in kwargs:
        sweep_config = kwargs.pop("sweep_config")
        sweep_values = {key: value["value"] for key, value in sweep_config.as_dict().items() if key != "_wandb"}
        self.args.update_from_dict(sweep_values)

    if self.args.manual_seed:
        random.seed(self.args.manual_seed)
        np.random.seed(self.args.manual_seed)
        torch.manual_seed(self.args.manual_seed)
        if self.args.n_gpu > 0:
            torch.cuda.manual_seed_all(self.args.manual_seed)

    if self.args.local_rank != -1:
        logger.info(f"local_rank: {self.args.local_rank}")
        torch.distributed.init_process_group(backend="nccl")
        cuda_device = self.args.local_rank

    if use_cuda:
        if torch.cuda.is_available():
            if cuda_device == -1:
                self.device = torch.device("cuda")
            else:
                self.device = torch.device(f"cuda:{cuda_device}")
        else:
            raise ValueError(
                "'use_cuda' set to True when cuda is unavailable."
                " Make sure CUDA is available or set use_cuda=False."
            )
    else:
        self.device = "cpu"

    self.results = {}

    if not use_cuda:
        self.args.fp16 = False

    self.args.model_name = model_name
    self.args.model_type = model_type

    config_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]
    self.tokenizer_class = tokenizer_class
    new_tokenizer = False

    if self.args.tokenizer_name:
        self.tokenizer = tokenizer_class.from_pretrained(self.args.tokenizer_name, cache_dir=self.args.cache_dir)
    elif self.args.model_name:
        if self.args.model_name == "electra":
            self.tokenizer = tokenizer_class.from_pretrained(
                generator_name, cache_dir=self.args.cache_dir, **kwargs
            )
            self.args.tokenizer_name = self.args.model_name
        else:
            self.tokenizer = tokenizer_class.from_pretrained(model_name, cache_dir=self.args.cache_dir, **kwargs)
            self.args.tokenizer_name = self.args.model_name
    else:
        if not train_files:
            raise ValueError(
                "model_name and tokenizer_name are not specified."
                "You must specify train_files to train a Tokenizer."
            )
        else:
            self.train_tokenizer(train_files)
            new_tokenizer = True

    if self.args.config_name:
        self.config = config_class.from_pretrained(self.args.config_name, cache_dir=self.args.cache_dir)
    elif self.args.model_name and self.args.model_name != "electra":
        self.config = config_class.from_pretrained(model_name, cache_dir=self.args.cache_dir, **kwargs)
    else:
        self.config = config_class(**self.args.config, **kwargs)
    if self.args.vocab_size:
        self.config.vocab_size = self.args.vocab_size
    if new_tokenizer:
        self.config.vocab_size = len(self.tokenizer)

    if self.args.model_type == "electra":
        if generator_name:
            self.generator_config = ElectraConfig.from_pretrained(generator_name)
        elif self.args.model_name:
            self.generator_config = ElectraConfig.from_pretrained(
                os.path.join(self.args.model_name, "generator_config"), **kwargs,
            )
        else:
            self.generator_config = ElectraConfig(**self.args.generator_config, **kwargs)
            if new_tokenizer:
                self.generator_config.vocab_size = len(self.tokenizer)

        if discriminator_name:
            self.discriminator_config = ElectraConfig.from_pretrained(discriminator_name)
        elif self.args.model_name:
            self.discriminator_config = ElectraConfig.from_pretrained(
                os.path.join(self.args.model_name, "discriminator_config"), **kwargs,
            )
        else:
            self.discriminator_config = ElectraConfig(**self.args.discriminator_config, **kwargs)
            if new_tokenizer:
                self.discriminator_config.vocab_size = len(self.tokenizer)

    if self.args.block_size <= 0:
        self.args.block_size = min(self.args.max_seq_length, self.tokenizer.max_len)
    else:
        self.args.block_size = min(self.args.block_size, self.tokenizer.max_len, self.args.max_seq_length)

    if self.args.model_name:
        if self.args.model_type == "electra":
            if self.args.model_name == "electra":
                generator_model = ElectraForMaskedLM.from_pretrained(generator_name)
                discriminator_model = ElectraForPreTraining.from_pretrained(discriminator_name)
                self.model = ElectraForLanguageModelingModel(
                    config=self.config,
                    generator_model=generator_model,
                    discriminator_model=discriminator_model,
                    generator_config=self.generator_config,
                    discriminator_config=self.discriminator_config,
                    tie_generator_and_discriminator_embeddings=self.args.tie_generator_and_discriminator_embeddings,
                )
                model_to_resize = (
                    self.model.generator_model.module
                    if hasattr(self.model.generator_model, "module")
                    else self.model.generator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))

                model_to_resize = (
                    self.model.discriminator_model.module
                    if hasattr(self.model.discriminator_model, "module")
                    else self.model.discriminator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))
                self.model.generator_model = generator_model
                self.model.discriminator_model = discriminator_model
            else:
                self.model = model_class.from_pretrained(
                    model_name,
                    config=self.config,
                    cache_dir=self.args.cache_dir,
                    generator_config=self.generator_config,
                    discriminator_config=self.discriminator_config,
                    **kwargs,
                )
                self.model.load_state_dict(torch.load(os.path.join(self.args.model_name, "pytorch_model.bin")))
        else:
            self.model = model_class.from_pretrained(
                model_name,
                config=self.config,
                cache_dir=self.args.cache_dir,
                **kwargs,
            )
    else:
        logger.info(" Training language model from scratch")
        if self.args.model_type == "electra":
            generator_model = ElectraForMaskedLM(config=self.generator_config)
            discriminator_model = ElectraForPreTraining(config=self.discriminator_config)
            self.model = ElectraForLanguageModelingModel(
                config=self.config,
                generator_model=generator_model,
                discriminator_model=discriminator_model,
                generator_config=self.generator_config,
                discriminator_config=self.discriminator_config,
                tie_generator_and_discriminator_embeddings=self.args.tie_generator_and_discriminator_embeddings,
            )
            model_to_resize = (
                self.model.generator_model.module
                if hasattr(self.model.generator_model, "module")
                else self.model.generator_model
            )
            model_to_resize.resize_token_embeddings(len(self.tokenizer))

            model_to_resize = (
                self.model.discriminator_model.module
                if hasattr(self.model.discriminator_model, "module")
                else self.model.discriminator_model
            )
            model_to_resize.resize_token_embeddings(len(self.tokenizer))
        else:
            self.model = model_class(config=self.config)
            model_to_resize = self.model.module if hasattr(self.model, "module") else self.model
            model_to_resize.resize_token_embeddings(len(self.tokenizer))

    if model_type in ["camembert", "xlmroberta"]:
        warnings.warn(
            f"use_multiprocessing automatically disabled as {model_type}"
            " fails when using multiprocessing for feature conversion."
        )
        self.args.use_multiprocessing = False

    if self.args.wandb_project and not wandb_available:
        warnings.warn("wandb_project specified but wandb is not available. Wandb disabled.")
        self.args.wandb_project = None
print(tokenized_text)

# Convert token to vocabulary indices
token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
token_type_ids = [0] * len(token_ids)
print(token_ids)
print(token_type_ids)  # segment_ids

# Convert inputs to PyTorch tensors
token_ids_tensor = torch.tensor([token_ids]).to('cuda')
token_type_ids_tensor = torch.tensor([token_type_ids]).to('cuda')

## 2
# Load pre-trained model (weights)
model = ElectraForMaskedLM.from_pretrained('monologg/koelectra-base-discriminator')

# Set the model in evaluation mode to deactivate the DropOut modules
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()
model.to('cuda')

## 3
# Predict all tokens
with torch.no_grad():
    outputs = model(token_ids_tensor, token_type_ids=token_type_ids_tensor)
    predictions = outputs[0]

print(predictions)

predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
import argparse

import torch
from transformers import ElectraForMaskedLM, ElectraTokenizer

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Unsupervised training')
    parser.add_argument("--model_path", type=str, default="", help="Pretrained Electra Model Path")
    parser.add_argument("--emb_out_path", type=str, default="", help="Electra Embedding Output Path")
    args = parser.parse_args()

    electra_model = ElectraForMaskedLM.from_pretrained(args.model_path)
    tokenizer = ElectraTokenizer.from_pretrained(args.model_path)

    vocab_size, emb_dim = electra_model.electra.embeddings.word_embeddings.weight.size()
    assert vocab_size == tokenizer.vocab_size

    vectors = torch.zeros((vocab_size, emb_dim), dtype=torch.float32)
    dico = []
    for idx in range(vocab_size):
        token = tokenizer.ids_to_tokens.get(idx)
        assert token is not None
        dico.append(token)
        vectors[idx] = electra_model.electra.embeddings.word_embeddings.weight[