def main(args):
    with open(args.config) as fp:
        data = json.loads(fp.read())
    config = AlbertConfig(**data)
    model = AlbertForMaskedLM(config)
    model: AlbertForMaskedLM = load_tf_weights_in_albert(model, config, args.checkpoint)
    model.save_pretrained(args.output)
def init_model(self, device):
    """Initialize the language model and send it to the given device

    Note:
        Transformers v4 and higher made return_dict=True the default.

    Args:
        device (str): torch device (usually "cpu" or "cuda")

    Returns:
        model: a torch model for masked language modeling
    """
    model = None
    if self.model_name.lower().find('albert') >= 0:
        try:
            model = AlbertForMaskedLM.from_pretrained(
                self.model_name, return_dict=False).to(device)
        except TypeError:  # older transformers: from_pretrained has no return_dict kwarg
            model = AlbertForMaskedLM.from_pretrained(self.model_name).to(device)
    else:
        try:
            model = BertForMaskedLM.from_pretrained(
                self.model_name, return_dict=False).to(device)
        except TypeError:
            model = BertForMaskedLM.from_pretrained(self.model_name).to(device)
    model.eval()
    return model
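# Minimal standalone sketch (not from the original source) of the fallback
# pattern init_model relies on: request tuple outputs with return_dict=False on
# transformers v4+, and retry without the kwarg on older versions. The
# checkpoint name and device choice are illustrative assumptions.
import torch
from transformers import AlbertForMaskedLM

device = "cuda" if torch.cuda.is_available() else "cpu"
try:
    mlm = AlbertForMaskedLM.from_pretrained("albert-base-v2",
                                            return_dict=False).to(device)
except TypeError:  # older transformers: from_pretrained has no return_dict kwarg
    mlm = AlbertForMaskedLM.from_pretrained("albert-base-v2").to(device)
mlm.eval()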
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = AlbertForMaskedLM(config)

    load_tf_weights_in_albert(model, config, tf_checkpoint_path)

    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
def main(args):
    with open(args.config) as fp:
        data = json.loads(fp.read())
    config = AlbertConfig(**data)
    model = AlbertForMaskedLM(config)
    model: AlbertForMaskedLM = load_tf_weights_in_albert(
        model, config, args.checkpoint)
    model.save_pretrained(args.output)

    tokenizer = AlbertTokenizer.from_pretrained(args.spiece, keep_accents=True)
    tokenizer.save_pretrained(args.output)
def albert_convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert

    # Initialise PyTorch model
    config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = AlbertForMaskedLM(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_albert(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
def __init__(self,
             vocab: Vocabulary,
             model_name: str = "bert-base",
             multi_choice: bool = False):
    super().__init__(vocab)
    self._model = None
    self._loss = CrossEntropyLoss()
    self.is_multi_choice = multi_choice
    if model_name.startswith('bert'):
        if self.is_multi_choice:
            self._model = BertMultiChoiceMLM.from_pretrained(model_name)
        else:
            self._model = BertForMaskedLM.from_pretrained(model_name)
    elif 'roberta' in model_name:
        if self.is_multi_choice:
            self._model = RobertaMultiChoiceMLM.from_pretrained(model_name)
        else:
            self._model = RobertaForMaskedLM.from_pretrained(model_name)
    elif 'albert' in model_name:
        self._model = AlbertForMaskedLM.from_pretrained(model_name)
    elif 'xlnet' in model_name:
        self._model = XLNetLMHeadModel.from_pretrained(model_name)
    else:
        raise ValueError("Requested model is not supported.")
def __init__(self, config):
    super(LMDecodingModel, self).__init__()
    self.config = config
    self.dep_tree_baseline = config[MODEL_TYPE] == DEP_TREETRAIN_BASELINE
    self.albert = AlbertForMaskedLM.from_pretrained('albert-base-v2')
    self.albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
def __init__(self, transformer_model, is_train):
    super(LMNER, self).__init__()
    config = AlbertConfig.from_pretrained(transformer_model)
    self.transformer_model = AlbertForMaskedLM.from_pretrained(
        transformer_model, config=config)
    # whether to train (fine-tune) the BERT weights or keep them frozen
    for name, param in self.transformer_model.named_parameters():
        param.requires_grad = is_train
def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids,
                                          input_mask, sequence_labels, token_labels,
                                          choice_labels):
    model = AlbertForMaskedLM(config=config)
    model.eval()
    loss, prediction_scores = model(input_ids,
                                    attention_mask=input_mask,
                                    token_type_ids=token_type_ids,
                                    masked_lm_labels=token_labels)
    result = {
        "loss": loss,
        "prediction_scores": prediction_scores,
    }
    self.parent.assertListEqual(
        list(result["prediction_scores"].size()),
        [self.batch_size, self.seq_length, self.vocab_size])
    self.check_loss_output(result)
def setUp(self):
    super(TestAlbertMaskModel, self).setUp()
    albert_pre_train = "/Users/Vander/Code/pytorch_col/albert_chinese_base_hf"
    # albert_pre_train = "/Users/Vander/Code/pytorch_col/albert_chinese_large_hf"
    # albert_pre_train = "/Users/Vander/Code/pytorch_col/albert_chinese_xlarge_hf"
    self.tokenizer = BertTokenizer.from_pretrained(albert_pre_train)
    self.mask_model = AlbertForMaskedLM.from_pretrained(albert_pre_train)
    self.mask_token = self.tokenizer.mask_token
    self.mask_id = self.tokenizer.mask_token_id
def load_HFpretrained_weights(self):
    hf_state_dict = AlbertForMaskedLM.from_pretrained(FLAGS.hf_model_handle).state_dict()
    repl = {
        "albert.embeddings": 'embedder',
        'word_embeddings': 'idx_to_embedding',
        'albert.encoder.embedding_hidden_mapping_in': 'embedder.embedding_to_hidden',
        'albert.encoder.albert_layer_groups.0.albert_layers.0': 'shared_encoder_block',
        'attention.dense': 'multihead_attention.project_o',
        'attention': 'multihead_attention',
        'full_layer_layer_norm': 'feedforward.LayerNorm',
        'query': 'project_q',
        'key': 'project_k',
        'value': 'project_v',
        'ffn.': 'feedforward.linear_in.',
        'ffn_output': 'feedforward.linear_out',
        'predictions': 'lm_head',
    }
    # use these three lines to do the replacement
    repl = dict((re.escape(k), v) for k, v in repl.items())
    pattern = re.compile("|".join(repl.keys()))
    updated_hf_state_dict = OrderedDict(
        (pattern.sub(lambda m: repl[re.escape(m.group(0))], k), v)
        for k, v in hf_state_dict.items())

    # Allow for cutting the sequence length short
    updated_hf_state_dict['embedder.position_embeddings.weight'] = \
        updated_hf_state_dict['embedder.position_embeddings.weight'][:FLAGS.max_seq_length, :].clone()

    missing, unexpected = self.load_state_dict(updated_hf_state_dict, strict=False)

    # Allowed discrepancies: we don't care about the pooler, the relative
    # attention bias is optional, and 'lm_head.bias' is only used to set the LM
    # head decoder bias to zero, so it is currently ignored.
    ignored_hf_parameters = ['pooler', 'position_embeddings', 'lm_head.bias']
    allowed_from_scratch_params = [
        'relative_attention_bias', 'top_down_regressor', 'combiner',
        'shared_top_down_predictor', 'shared_from_left_predictor',
        'shared_from_right_predictor'
    ]
    for m in missing:
        if not any([s in m for s in allowed_from_scratch_params]):
            raise ValueError(
                f'Unexpected mismatch in loading state dict: {m} not present in pretrained.')
    for u in unexpected:
        if not any([s in u for s in ignored_hf_parameters]):
            raise ValueError(
                f'Unexpected mismatch in loading state dict: {u} in pretrained but not in current model.')
    log.info(f"Loaded pretrained weights from {FLAGS.hf_model_handle}")
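# Isolated sketch (not from the original source) of the multi-pattern rename
# trick used in load_HFpretrained_weights above: compile one alternation over
# the escaped keys and substitute through a dictionary lookup. Toy keys only.
import re

toy_repl = {'albert.embeddings': 'embedder', 'query': 'project_q'}
toy_repl = {re.escape(k): v for k, v in toy_repl.items()}
toy_pattern = re.compile("|".join(toy_repl.keys()))
renamed = toy_pattern.sub(lambda m: toy_repl[re.escape(m.group(0))],
                          'albert.embeddings.query.weight')
print(renamed)  # -> embedder.project_q.weight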
def _from_pretrained(self, pretrain_name: str):
    r"""Load the appropriate model and tokenizer for the given model name."""
    if 'albert' in pretrain_name:
        model = AlbertForMaskedLM.from_pretrained(pretrain_name)
        tokenizer = BertTokenizer.from_pretrained(pretrain_name)
    elif 'bert' in pretrain_name:
        tokenizer = AutoTokenizer.from_pretrained(pretrain_name)
        model = AutoModelWithLMHead.from_pretrained(pretrain_name)
    else:
        raise ValueError(f"Unsupported model name: {pretrain_name}")
    self.model = model
    self.tokenizer = tokenizer
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             model_args: Dict = {}, cache_dir: Optional[str] = None):
    super(Transformer, self).__init__()
    self.config_keys = ['max_seq_length']
    self.max_seq_length = max_seq_length

    config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir)
    model_type = config.model_type if hasattr(config, 'model_type') else ''
    if model_type == 'albert':
        self.model = AlbertForMaskedLM.from_pretrained(model_name_or_path,
                                                       config=config, cache_dir=cache_dir)
        self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
    else:
        self.model = AutoModel.from_pretrained(model_name_or_path,
                                               config=config, cache_dir=cache_dir)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
def create_and_check_for_masked_lm(self, config, input_ids, token_type_ids, input_mask,
                                   sequence_labels, token_labels, choice_labels):
    model = AlbertForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask,
                   token_type_ids=token_type_ids, labels=token_labels)
    self.parent.assertEqual(result.logits.shape,
                            (self.batch_size, self.seq_length, self.vocab_size))
def __init__(self, device):
    self.device = device
    self.bert_tokenizer = BertTokenizerFast.from_pretrained('kykim/bert-kor-base')
    self.bert_model = BertForMaskedLM.from_pretrained('kykim/bert-kor-base').eval()
    self.bert_model.to(device)
    self.albert_tokenizer = BertTokenizerFast.from_pretrained('kykim/albert-kor-base')
    self.albert_model = AlbertForMaskedLM.from_pretrained('kykim/albert-kor-base').eval()
    self.albert_model.to(device)
def get_model(args):
    if args.model_size == 'debug':
        num_hidden_layers = 1
        embedding_size = 8
        hidden_size = 16
        intermediate_size = 32
        num_attention_heads = 2
        args.gen_ratio = 2
    elif args.model_size == 'tiny':
        num_hidden_layers = 4
        embedding_size = 128
        hidden_size = 336
        intermediate_size = 1344
        num_attention_heads = 12
    elif args.model_size == 'small':
        num_hidden_layers = 12
        embedding_size = 128
        hidden_size = 256
        intermediate_size = 1024
        num_attention_heads = 4
    elif args.model_size == 'base':
        num_hidden_layers = 12
        embedding_size = 768
        hidden_size = 768
        intermediate_size = 3072
        num_attention_heads = 12
    else:
        raise Exception('Which model? debug, tiny, small, base')

    config = AlbertConfig(
        max_position_embeddings=args.seq_length,
        vocab_size=args.vocab_size,
        num_hidden_layers=num_hidden_layers,
        embedding_size=embedding_size,
        hidden_size=hidden_size // args.gen_ratio,
        intermediate_size=intermediate_size // args.gen_ratio,
        num_attention_heads=num_attention_heads // args.gen_ratio,
    )
    model = AlbertForMaskedLM(config)
    return model
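# Hedged sanity check for get_model above (not part of the original source):
# build the 'debug' configuration and count parameters. The Namespace fields
# mirror exactly what get_model reads; seq_length and vocab_size here are
# illustrative assumptions (gen_ratio is overwritten to 2 by the debug branch).
import argparse

debug_args = argparse.Namespace(model_size='debug', gen_ratio=1,
                                seq_length=128, vocab_size=30000)
debug_model = get_model(debug_args)
print(sum(p.numel() for p in debug_model.parameters()), "parameters")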
def create_and_check_albert_for_masked_lm(self, config, input_ids, token_type_ids,
                                          input_mask, sequence_labels, token_labels,
                                          choice_labels):
    model = AlbertForMaskedLM(config=config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask,
                   token_type_ids=token_type_ids, labels=token_labels)
    self.parent.assertListEqual(
        list(result["logits"].size()),
        [self.batch_size, self.seq_length, self.vocab_size])
    self.check_loss_output(result)
def __init__(
    self,
    model=None,
    tokenizer=None,
    model_name="bert-large-uncased",
    mask_token="***mask***",
    disable_gpu=False,
):
    self.mask_token = mask_token
    self.delemmatizer = Delemmatizer()
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() and not disable_gpu else "cpu")
    print("using model:", model_name)
    print("device:", self.device)

    if not model:
        if "distilbert" in model_name:
            self.bert = DistilBertForMaskedLM.from_pretrained(model_name)
        elif "albert" in model_name:
            self.bert = AlbertForMaskedLM.from_pretrained(model_name)
        else:
            self.bert = BertForMaskedLM.from_pretrained(model_name)
        self.bert.to(self.device)
    else:
        self.bert = model

    if not tokenizer:
        if "distilbert" in model_name:
            self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        elif "albert" in model_name:
            self.tokenizer = AlbertTokenizer.from_pretrained(model_name)
        else:
            self.tokenizer = BertTokenizer.from_pretrained(model_name)
    else:
        self.tokenizer = tokenizer

    self.bert.eval()
def _contextual_model_init(self):
    """
    Initialize the context-based word-similarity computation: load the vocabulary and the model.
    :return: None
    """
    pretrain_name = self.model_path + self.model_params['pre_train_model_path']
    logging.info('pretrain_name: %s', pretrain_name)
    if 'albert' in pretrain_name:
        self._contextual_model = AlbertForMaskedLM.from_pretrained(pretrain_name)
        self._contextual_tokenizer = BertTokenizer.from_pretrained(pretrain_name)
    elif 'ernie' in pretrain_name or 'roberta' in pretrain_name:
        self._contextual_tokenizer = BertTokenizer.from_pretrained(pretrain_name)
        self._contextual_model = BertModel.from_pretrained(pretrain_name)
    else:  # e.g. 'bert' in pretrain_name
        self._contextual_tokenizer = AutoTokenizer.from_pretrained(pretrain_name)
        model_config = AutoConfig.from_pretrained(pretrain_name)
        self._contextual_model = AutoModel.from_pretrained(pretrain_name, config=model_config)
def main():
    random.seed(1012)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    chars = string.ascii_lowercase
    number_of_entity_trials = 10

    tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
    model = AlbertForMaskedLM.from_pretrained('albert-xxlarge-v2')

    names = proc.generate_pairs_of_random_names(number_of_pairs=100)

    with open("../data/truism_data/social_data_sentences_2.json", "r") as f:
        social_sents = json.load(f)
    with open("../data/truism_data/social_data_2.json", "r") as f:
        social_config = json.load(f)
    logger.info("finished reading in social data")

    output_df = run_pipeline(model=model,
                             tokenizer=tokenizer,
                             fictitious_entities=names,
                             sentences=social_sents,
                             config=social_config,
                             number_of_entity_trials=number_of_entity_trials,
                             logger=logger)
    output_df.to_csv(
        "../data/masked_word_result_data/albert_w_name/alberta_social_perf_2_{}.csv"
        .format(number_of_entity_trials),
        index=False)
    logger.info("finished saving social results")
from transformers import BertTokenizer, AlbertForMaskedLM
import os

# pretrained = 'voidful/albert_chinese_xlarge'
pretrained = 'voidful/albert_chinese_large'
tokenizer = BertTokenizer.from_pretrained(pretrained)
model = AlbertForMaskedLM.from_pretrained(pretrained)

model.save_pretrained('albert_model')
tokenizer.save_pretrained('albert_model')
os.remove("albert_model/special_tokens_map.json")
os.remove("albert_model/tokenizer_config.json")
os.system("mv albert_model ../")
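# Hedged follow-up sketch: after the `mv` above, the exported checkpoint can be
# reloaded from its new location. Using BertTokenizer (not AlbertTokenizer) is
# deliberate: the voidful/albert_chinese_* checkpoints ship a BERT-style vocab
# rather than a sentencepiece model. The relative path is an assumption.
reloaded_tokenizer = BertTokenizer.from_pretrained('../albert_model')
reloaded_model = AlbertForMaskedLM.from_pretrained('../albert_model')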
# %%
import torch
import string
from transformers import \
    AlbertTokenizer, AlbertForMaskedLM, \
    DistilBertTokenizer, DistilBertForMaskedLM, \
    RobertaTokenizer, RobertaForMaskedLM

albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
albert_model = AlbertForMaskedLM.from_pretrained('albert-base-v2').eval()

albert_large_tokenizer = AlbertTokenizer.from_pretrained('albert-large-v2')
albert_large_model = AlbertForMaskedLM.from_pretrained('albert-large-v2').eval()

distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
distilbert_model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased').eval()

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-large').eval()

top_k = 10


def decode(tokenizer, pred_idx, top_clean):
    ignore_tokens = string.punctuation + '[PAD]'
    tokens = []
    for w in pred_idx:
        token = ''.join(tokenizer.decode(w).split())
        if token not in ignore_tokens:
            tokens.append(token)
    return '\n'.join(tokens[:top_clean])
def main(tokenizer_path, dataset_path, save_path='alectra-small', max_steps=1e6,
         accumulate_grad_batches=1, gpus=None, num_tpu_cores=None,
         distributed_backend=None, val_check_interval=0.25, val_check_percent=0.25,
         generator_type='albert', num_hidden_groups=1, d_loss_weight=50,
         mlm_prob=0.15, learning_rate=5e-4, warmup_steps=10000, batch_size=128,
         num_workers=2, tie_embedding_proj=False, tie_encoder=True, shuffle=True,
         lr_schedule='linear', resume_from_checkpoint=None, use_polyaxon=False):
    # init tokenizer. only need it for the special chars.
    tokenizer = BertWordPieceTokenizer(tokenizer_path)

    # init generator.
    if generator_type == 'albert':
        generator_config = AlbertConfig(
            vocab_size=tokenizer._tokenizer.get_vocab_size(),
            hidden_size=256,
            embedding_size=128,
            num_hidden_layers=3,
            num_attention_heads=1,
            num_hidden_groups=num_hidden_groups,
            intermediate_size=1024,
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            classifier_dropout_prob=0.1,
            max_position_embeddings=128)
        generator = AlbertForMaskedLM(generator_config)
    elif generator_type == 'bert':
        generator_config = BertConfig(
            vocab_size=tokenizer._tokenizer.get_vocab_size(),
            hidden_size=128,
            num_hidden_layers=3,
            num_attention_heads=1,
            intermediate_size=256,
            max_position_embeddings=128)
        generator = BertForMaskedLM(generator_config)
        tie_weights(generator.cls.predictions.decoder,
                    generator.bert.embeddings.word_embeddings)
    else:
        raise Exception(f"invalid generator type: {generator_type}")

    # init discriminator.
    discriminator_config = AlbertConfig(
        vocab_size=tokenizer._tokenizer.get_vocab_size(),
        hidden_size=256,
        embedding_size=128,
        num_hidden_layers=12,
        num_attention_heads=4,
        num_hidden_groups=num_hidden_groups,
        intermediate_size=1024,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        classifier_dropout_prob=0.1,
        max_position_embeddings=128)
    discriminator = AlbertForTokenClassification(discriminator_config)

    # tie the embedding weights.
    tie_weights(discriminator.base_model.embeddings.word_embeddings,
                generator.base_model.embeddings.word_embeddings)
    tie_weights(discriminator.base_model.embeddings.position_embeddings,
                generator.base_model.embeddings.position_embeddings)
    tie_weights(discriminator.base_model.embeddings.token_type_embeddings,
                generator.base_model.embeddings.token_type_embeddings)

    if generator_type == 'albert' and tie_encoder:
        print('tying albert encoder layers')
        discriminator.albert.encoder.albert_layer_groups = generator.albert.encoder.albert_layer_groups
    if generator_type == 'albert' and tie_embedding_proj:
        print('tying embedding projection layers')
        discriminator.albert.encoder.embedding_hidden_mapping_in = generator.albert.encoder.embedding_hidden_mapping_in

    # init training module.
    training_config = DiscLMTrainingModuleConfig(max_steps,
                                                 d_loss_weight=d_loss_weight,
                                                 save_path=save_path,
                                                 weight_decay=0.01,
                                                 learning_rate=learning_rate,
                                                 epsilon=1e-6,
                                                 lr_schedule=lr_schedule,
                                                 warmup_steps=warmup_steps)
    if use_polyaxon:
        checkpoint_fn = polyaxon_checkpoint_fn
    else:
        checkpoint_fn = None
    lightning_module = DiscLMTrainingModule(generator, discriminator,
                                            training_config,
                                            checkpoint_fn=checkpoint_fn)

    # init trainer.
    trainer = Trainer(accumulate_grad_batches=accumulate_grad_batches,
                      gpus=gpus,
                      num_tpu_cores=num_tpu_cores,
                      distributed_backend=distributed_backend,
                      max_steps=max_steps,
                      resume_from_checkpoint=resume_from_checkpoint,
                      val_check_percent=val_check_percent,
                      val_check_interval=val_check_interval)

    # init dataloaders.
    train_loader, val_loader, _ = get_dataloaders(tokenizer, dataset_path, trainer,
                                                  mlm_prob, batch_size, num_workers,
                                                  shuffle)

    # train.
    trainer.fit(lightning_module, train_loader, val_loader)

    # save the model.
    output_path = os.path.join(save_path, 'discriminator', 'final')
    os.makedirs(output_path, exist_ok=True)
    lightning_module.discriminator.base_model.save_pretrained(output_path)
    if checkpoint_fn:
        checkpoint_fn(lightning_module)
def __init__(self) -> None:
    self.lists = {}

    # M-BERT
    from transformers import BertTokenizerFast, BertForMaskedLM
    self.bert_multilingual_tokenizer = BertTokenizerFast.from_pretrained(
        'bert-base-multilingual-cased')
    self.bert_multilingual_model = BertForMaskedLM.from_pretrained(
        'bert-base-multilingual-cased').eval()
    self.lists["M-BERT"] = {
        "Tokenizer": self.bert_multilingual_tokenizer,
        "Model": self.bert_multilingual_model
    }
    print("====================================")
    print("[BERT] Google Multilingual BERT loaded")
    print("====================================")

    # KR-BERT
    self.krbert_tokenizer = BertTokenizerFast.from_pretrained('snunlp/KR-Medium')
    self.krbert_model = BertForMaskedLM.from_pretrained('snunlp/KR-Medium').eval()
    self.lists["KR-Medium"] = {
        "Tokenizer": self.krbert_tokenizer,
        "Model": self.krbert_model
    }
    print("====================================")
    print("[BERT] KR-BERT loaded")
    print("====================================")

    # BERT
    self.bert_kor_tokenizer = BertTokenizerFast.from_pretrained('kykim/bert-kor-base')
    self.bert_kor_model = BertForMaskedLM.from_pretrained('kykim/bert-kor-base').eval()
    self.lists["bert-kor-base"] = {
        "Tokenizer": self.bert_kor_tokenizer,
        "Model": self.bert_kor_model
    }
    print("====================================")
    print("[BERT] BERT-kor-base loaded")
    print("====================================")

    # ALBERT
    from transformers import AlbertForMaskedLM
    self.albert_tokenizer = BertTokenizerFast.from_pretrained('kykim/albert-kor-base')
    self.albert_model = AlbertForMaskedLM.from_pretrained('kykim/albert-kor-base').eval()
    self.lists["albert-kor-base"] = {
        "Tokenizer": self.albert_tokenizer,
        "Model": self.albert_model
    }
    print("====================================")
    print("[BERT] ALBERT-kor-base loaded")
    print("====================================")

    # XLM-Roberta
    from transformers import XLMRobertaTokenizerFast, XLMRobertaForMaskedLM
    self.xlmroberta_tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')
    self.xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained('xlm-roberta-base').eval()
    self.lists["xlm-roberta-base"] = {
        "Tokenizer": self.xlmroberta_tokenizer,
        "Model": self.xlmroberta_model
    }
    print("====================================")
    print("[BERT] XLM-Roberta-kor loaded")
    print("====================================")

    from transformers import EncoderDecoderModel
    self.tokenizer_bertshared = BertTokenizerFast.from_pretrained("kykim/bertshared-kor-base")
    self.bertshared_model = EncoderDecoderModel.from_pretrained("kykim/bertshared-kor-base")
    self.lists["bertshared-kor-base"] = {
        "Tokenizer": self.tokenizer_bertshared,
        "Model": self.bertshared_model
    }
    print("====================================")
    print("[Seq2seq + BERT] bertshared-kor-base loaded")
    print("====================================")

    # gpt3-kor-small_based_on_gpt2
    from transformers import GPT2LMHeadModel
    self.tokenizer_gpt3 = BertTokenizerFast.from_pretrained("kykim/gpt3-kor-small_based_on_gpt2")
    self.model_gpt3 = GPT2LMHeadModel.from_pretrained("kykim/gpt3-kor-small_based_on_gpt2")
    self.lists["gpt3-kor-small_based_on_gpt2"] = {
        "Tokenizer": self.tokenizer_gpt3,
        "Model": self.model_gpt3
    }
    print("====================================")
    print("[GPT3] gpt3-small-based-on-gpt2 loaded")
    print("====================================")

    # electra-base-kor
    from transformers import ElectraTokenizerFast, ElectraModel
    self.tokenizer_electra = ElectraTokenizerFast.from_pretrained("kykim/electra-kor-base")
    self.electra_model = ElectraModel.from_pretrained("kykim/electra-kor-base")
    self.lists["electra-kor-base"] = {
        "Tokenizer": self.tokenizer_electra,
        "Model": self.electra_model
    }
    print("====================================")
    print("[ELECTRA] electra-kor-base loaded")
    print("====================================")

    from transformers import ElectraForQuestionAnswering
    self.electra_tokenizer_QA = ElectraTokenizerFast.from_pretrained(
        "monologg/koelectra-base-v3-finetuned-korquad")
    self.electra_model_QA = ElectraForQuestionAnswering.from_pretrained(
        "monologg/koelectra-base-v3-finetuned-korquad")
    self.lists["electra-kor-QA"] = {
        "Tokenizer": self.electra_tokenizer_QA,
        "Model": self.electra_model_QA
    }
    print("====================================")
    print("[ELECTRA] koelectra-base-v3-finetuned-korquad loaded")
    print("====================================")
from transformers import AlbertForMaskedLM, AlbertTokenizer
import torch

tokenizer = AlbertTokenizer.from_pretrained("albert-large-v2")
model = AlbertForMaskedLM.from_pretrained("albert-large-v2")

sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."

input_ids = tokenizer.encode(sequence, return_tensors="pt")
# position of the masked token
mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]

# logits for every position: [batch_size, seq_length, vocab_size]
token_logits = model(input_ids)[0]
# keep only the logits at the masked position
mask_token_logits = token_logits[0, mask_token_index, :]

# take just the top-5 candidates out of the whole vocabulary
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

# print the top-5 results
for token in top_5_tokens:
    print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
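# The same top-5 fill-in can be reproduced with the high-level pipeline API
# (an equivalent sketch reusing the model and tokenizer loaded above; the
# fill-mask pipeline returns its top five candidates by default):
from transformers import pipeline

fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)
for prediction in fill_mask(sequence):
    print(prediction["sequence"])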
parser = argparse.ArgumentParser()
parser.add_argument("-t", "--type_of_model", default='albert', help="pretrained LM type")
parser.add_argument("-p", "--path_to_pytorch_models", help="path to pytorch_model")
parser.add_argument("--config_and_vocab", help="path to config.json and vocab.model")
parser.add_argument("-s", "--step", type=str, help="pretrained step")
parser.add_argument("-d", "--data", help="path where you put your processed ontonotes data")
parser.add_argument("-o", "--output", help="output file")
args = parser.parse_args()

print("Reconstruction. step = ", args.step)

if args.type_of_model == 'albert':
    tokenizer = AlbertTokenizer(os.path.join(args.config_and_vocab, args.type_of_model, 'vocab.model'))
    config = AlbertConfig.from_json_file(os.path.join(args.config_and_vocab, args.type_of_model, 'config.json'))
    config.output_hidden_states = True
    model = AlbertForMaskedLM.from_pretrained(
        pretrained_model_name_or_path=None,
        config=config,
        state_dict=torch.load(os.path.join(
            args.path_to_pytorch_models, args.type_of_model,
            'pytorch_model_' + args.step + '.bin')))
elif args.type_of_model == 'bert':
    tokenizer = BertTokenizer(os.path.join(args.config_and_vocab, args.type_of_model, 'vocab.model'))
    config = BertConfig.from_json_file(os.path.join(args.config_and_vocab, args.type_of_model, 'config.json'))
    config.output_hidden_states = True
    model = BertForMaskedLM.from_pretrained(
        pretrained_model_name_or_path=None,
        config=config,
        state_dict=torch.load(os.path.join(
            args.path_to_pytorch_models, args.type_of_model,
            'pytorch_model_' + args.step + '.bin')))
else:
    raise NotImplementedError("The given model type %s is not supported" % args.type_of_model)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.eval().to(device)
def init_process(local_rank, backend, config, albert_config, logger):
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    torch.cuda.set_device(local_rank)

    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if local_rank != 0:
        logger.setLevel(logging.WARNING)
    if local_rank == 0:
        writer = SummaryWriter()
    if not os.path.exists("save"):
        os.mkdir("save")
    save_path = "save/model_{}.pt".format(re.sub(r"\s+", "_", time.asctime()))

    reader = Reader(config)
    start = time.time()
    logger.info("Loading data...")
    reader.load_data()
    end = time.time()
    logger.info("Loaded. {} secs".format(end - start))

    model = AlbertForMaskedLM(albert_config).cuda()
    optimizer = Adam(model.parameters(), lr=config.lr)
    lr = config.lr
    if config.save_path is not None:
        load(model, optimizer, config.save_path, local_rank)

    train.global_step = 0
    train.max_iter = len(list(reader.make_batch("train")))
    validate.max_iter = len(list(reader.make_batch("dev")))
    min_loss = 1e+10
    early_stop_count = config.early_stop_count

    # logger.info("Validate...")
    # loss = validate(model, reader, config, local_rank)
    # logger.info("loss: {:.4f}".format(loss))

    for epoch in range(config.max_epochs):
        logger.info("Train...")
        start = time.time()
        if local_rank == 0:
            train_test(model, reader, optimizer, config, local_rank, writer)
        else:
            train_test(model, reader, optimizer, config, local_rank)
        exit(0)
        end = time.time()
        logger.info("epoch: {}, {:.4f} secs".format(epoch + 1, end - start))

        logger.info("Validate...")
        loss = validate(model, reader, config, local_rank)
        logger.info("loss: {:.4f}".format(loss))
        if local_rank == 0:
            writer.add_scalar("Val/loss", loss, epoch + 1)

        if loss < min_loss:  # save model
            if local_rank == 0:
                save(model, optimizer, save_path)
                logger.info("Saved to {}.".format(os.path.abspath(save_path)))
            min_loss = loss
            early_stop_count = config.early_stop_count
        else:  # early stopping
            if early_stop_count == 0:
                if epoch < config.min_epochs:
                    early_stop_count += 1
                    logger.info("Too early to stop training.")
                    logger.info("early stop count: {}".format(early_stop_count))
                else:
                    logger.info("Early stopped.")
                    break
            elif early_stop_count == 2:
                lr = lr / 2
                logger.info("learning rate schedule: {}".format(lr))
                for param in optimizer.param_groups:
                    param["lr"] = lr
                early_stop_count -= 1
                logger.info("early stop count: {}".format(early_stop_count))
    logger.info("Training finished.")
def run_benchmark(model_name, benchmark_file, results_file, logging_file):
    with open(benchmark_file, "r") as f:
        benchmark = json.load(f)

    model = AlbertForMaskedLM.from_pretrained(model_name)
    tokenizer = AlbertTokenizer.from_pretrained(model_name)
    fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

    # Each pattern will store its own statistics
    results = []
    for pattern in patterns:
        result = {}
        result["false_positives"] = 0
        result["false_negatives"] = 0
        result["total_questions"] = 0
        result["correct"] = 0
        result["accuracy"] = 0.0
        result["pattern"] = pattern["prompt"]
        results.append(result)

    with open(logging_file, "w") as log:
        for benchmark_question in benchmark:
            output = fill_mask(benchmark_question["question"])
            output_str = output[0]["sequence"] + "\n"
            for o in output:
                output_str += str(o["token_str"][1:]) + " " + str(o["score"]) + "\n"
            print(output_str)
            log.write(output_str)

            # Update the correct pattern's stats
            for result in results:
                if result["pattern"] == benchmark_question["pattern"]:
                    result["total_questions"] += 1
                    if is_correct(output, benchmark_question["answer"]):
                        result["correct"] += 1
                        print("correct")
                        log.write("correct\n")
                    else:
                        print("incorrect")
                        log.write("incorrect\n")
                        if benchmark_question["answer"] == True:
                            result["false_negatives"] += 1
                        else:
                            result["false_positives"] += 1
                    break

    # Calculate each pattern's accuracy
    for result in results:
        result["accuracy"] = float(result["correct"]) / result["total_questions"]

    # Calculate and append the overall statistics
    results.append(compute_overall_results(results))
    results.append({"model_name": model_name,
                    "datetime": str(datetime.datetime.now())})

    # Store the results -- downside of no results until the end.
    with open(results_file, "w") as f:
        json.dump(results, f, indent=3)
Roberta = ModelInfo(
    RobertaForCausalLM.from_pretrained('roberta-base', return_dict=True),
    RobertaTokenizer.from_pretrained('roberta-base'), "_", vocab, "Roberta")

XLM = ModelInfo(
    XLMWithLMHeadModel.from_pretrained('xlm-mlm-xnli15-1024', return_dict=True),
    XLMTokenizer.from_pretrained('xlm-mlm-xnli15-1024'), "_", vocab, "XLM")

T5 = ModelInfo(
    T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True),
    T5Tokenizer.from_pretrained("t5-base"), "_", vocab, "T5")

Albert = ModelInfo(
    AlbertForMaskedLM.from_pretrained('albert-base-v2', return_dict=True),
    AlbertTokenizer.from_pretrained('albert-base-v2'), "_", vocab, "Albert")

TXL = ModelInfo(TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103'),
                TransfoXLTokenizer.from_pretrained('transfo-xl-wt103'),
                "_", vocab, "TXL")

if __name__ == "__main__":
    sentences = [sample_sentences("sentences4lara.txt") for i in range(11)]
    sent_dict = dict(zip([str(x) for x in range(1, 11)], sentences))
    sentence = sent_dict[sys.argv[2]]
    batch_size = 100
def evaluate(args):
    """
    Evaluate a masked language model using the CrowS-Pairs dataset.
    """
    print("Evaluating:")
    print("Input:", args.input_file)
    print("Model:", args.lm_model)
    print("=" * 100)

    logging.basicConfig(level=logging.INFO)

    # load data into a pandas DataFrame
    df_data = read_data(args.input_file)

    # supported masked language models
    if args.lm_model == "bert":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        model = BertForMaskedLM.from_pretrained('bert-base-uncased')
        uncased = True
    elif args.lm_model == "roberta":
        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
        model = RobertaForMaskedLM.from_pretrained('roberta-large')
        uncased = False
    elif args.lm_model == "albert":
        tokenizer = AlbertTokenizer.from_pretrained('albert-xxlarge-v2')
        model = AlbertForMaskedLM.from_pretrained('albert-xxlarge-v2')
        uncased = True

    model.eval()
    if torch.cuda.is_available():
        model.to('cuda')

    mask_token = tokenizer.mask_token
    log_softmax = torch.nn.LogSoftmax(dim=0)
    vocab = tokenizer.get_vocab()
    with open(args.lm_model + ".vocab", "w") as f:
        f.write(json.dumps(vocab))

    lm = {"model": model,
          "tokenizer": tokenizer,
          "mask_token": mask_token,
          "log_softmax": log_softmax,
          "uncased": uncased}

    # score each sentence.
    # each row in the dataframe has the sentid and score for pro and anti stereo.
    df_score = pd.DataFrame(columns=['sent_more', 'sent_less',
                                     'sent_more_score', 'sent_less_score',
                                     'score', 'stereo_antistereo', 'bias_type'])

    total_stereo, total_antistereo = 0, 0
    stereo_score, antistereo_score = 0, 0

    N = 0
    neutral = 0
    total = len(df_data.index)
    with tqdm(total=total) as pbar:
        for index, data in df_data.iterrows():
            direction = data['direction']
            bias = data['bias_type']
            score = mask_unigram(data, lm)

            for stype in score.keys():
                score[stype] = round(score[stype], 3)

            N += 1
            pair_score = 0
            pbar.update(1)
            if score['sent1_score'] == score['sent2_score']:
                neutral += 1
            else:
                if direction == 'stereo':
                    total_stereo += 1
                    if score['sent1_score'] > score['sent2_score']:
                        stereo_score += 1
                        pair_score = 1
                elif direction == 'antistereo':
                    total_antistereo += 1
                    if score['sent2_score'] > score['sent1_score']:
                        antistereo_score += 1
                        pair_score = 1

            sent_more, sent_less = '', ''
            if direction == 'stereo':
                sent_more = data['sent1']
                sent_less = data['sent2']
                sent_more_score = score['sent1_score']
                sent_less_score = score['sent2_score']
            else:
                sent_more = data['sent2']
                sent_less = data['sent1']
                sent_more_score = score['sent2_score']
                sent_less_score = score['sent1_score']

            df_score = df_score.append({'sent_more': sent_more,
                                        'sent_less': sent_less,
                                        'sent_more_score': sent_more_score,
                                        'sent_less_score': sent_less_score,
                                        'score': pair_score,
                                        'stereo_antistereo': direction,
                                        'bias_type': bias}, ignore_index=True)

    df_score.to_csv(args.output_file)
    print('=' * 100)
    print('Total examples:', N)
    print('Metric score:', round((stereo_score + antistereo_score) / N * 100, 2))
    print('Stereotype score:', round(stereo_score / total_stereo * 100, 2))
    if antistereo_score != 0:
        print('Anti-stereotype score:', round(antistereo_score / total_antistereo * 100, 2))
    print("Num. neutral:", neutral, round(neutral / N * 100, 2))
    print('=' * 100)
    print()