def bertForMaskedLM(*args, **kwargs):
    """
    BertForMaskedLM includes the BertModel Transformer followed by the
    (possibly) pre-trained masked language modeling head.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)

        # Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> masked_index = 8
        >>> tokenized_text[masked_index] = '[MASK]'
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])

        # Load bertForMaskedLM
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
        >>> model.eval()

        # Predict all tokens
        >>> with torch.no_grad():
        ...     predictions = model(tokens_tensor, segments_tensors)
        >>> predicted_index = torch.argmax(predictions[0, masked_index]).item()
        >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        >>> predicted_token
        'henson'
    """
    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
    return model
def load_bertmodel(modelname):
    if modelname:
        tokenizer = BertJapaneseTokenizer.from_pretrained(modelname)
        model = BertForMaskedLM.from_pretrained(modelname)
    else:
        tokenizer, model = None, None
    return tokenizer, model
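# Minimal usage sketch for load_bertmodel; the checkpoint name below is only an
# example of a BertJapaneseTokenizer-compatible model id (a local path works too).
tokenizer, model = load_bertmodel('cl-tohoku/bert-base-japanese-whole-word-masking')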
def __init__(self, args: argparse.Namespace):
    super().__init__()
    self.args = args
    self.bert_config = BertConfig.from_pretrained(self.args.bert_path)
    self.model = BertForMaskedLM(self.bert_config)
    self.loss_fn = CrossEntropyLoss(reduction="none")
    self.train_acc = MaskedAccuracy()
    self.valid_acc = MaskedAccuracy()
def __init__(self, device, keyword_model_file, model_file=None, n_kws=15):
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.vocab_size = self.tokenizer.vocab_size
    self.n_kws = n_kws
    self.mask_id = 103  # id of [MASK] in the bert-base-uncased vocab
    self.sep_id = 102   # id of [SEP] in the bert-base-uncased vocab
    self.kw_ex = KeywordExtractor(n_kws=self.n_kws)
    self.kw_ex.reload(keyword_model_file)
    self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
    self.device = device
    self.model.to(self.device)
    if model_file is not None:
        self.reload_model(model_file)
def __init__(self, device, model_file=None, n_kws=15):
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.vocab_size = self.tokenizer.vocab_size
    self.n_kws = n_kws
    self.mask_id = 103  # id of [MASK] in the bert-base-uncased vocab
    self.sep_id = 102   # id of [SEP] in the bert-base-uncased vocab
    self.kw_ex = KeywordExtractor(n_kws=self.n_kws)
    # self.kw_ex.reload(keyword_model_file)
    self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
    self.device = device
    self.model.to(self.device)
    if model_file is not None:
        print("Model:", "[{}]".format(model_file.split('/')[-1]), "is loaded.")
        self.reload_model(model_file)
import torch
from transformers.tokenization_bert_japanese import BertJapaneseTokenizer
from transformers.modeling_bert import BertForMaskedLM, BertConfig
import MeCab

# Load the models
tokenizer = BertJapaneseTokenizer.from_pretrained('model/')
config = BertConfig.from_json_file('model/bert_base_32k_config.json')
model = BertForMaskedLM.from_pretrained('model/model.ckpt-580000_pytorch.bin', config=config)
m = MeCab.Tagger("-Ochasen")


def sent_emb(text):
    print('text:', text)
    input_ids = tokenizer.encode(text, return_tensors='pt')
    print('tokenizer.convert:', tokenizer.convert_ids_to_tokens(input_ids[0].tolist()))
    masked_index = torch.where(input_ids == tokenizer.mask_token_id)[1].tolist()[0]
    print('masked index:', masked_index)
    result = model(input_ids)
    # top-10 candidate ids for the masked position
    pred_ids = result[0][:, masked_index].topk(10).indices.tolist()[0]
    output = []
    for pred_id in pred_ids:
        output_ids = input_ids.tolist()[0]
        output_ids[masked_index] = pred_id
        text = ''.join(tokenizer.decode(output_ids))
        # print(text)
        output.append(text)  # assumed continuation: collect each candidate sentence
    return output            # assumed continuation: the original snippet ends above
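# Minimal usage sketch; the Japanese sentence below is only an illustrative
# input and must contain the tokenizer's [MASK] token.
candidates = sent_emb('今日は[MASK]な天気です。')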
def __init__(self,
             vocab: Vocabulary,
             pretrained_model: str = None,
             requires_grad: bool = True,
             predictions_file=None,
             layer_freeze_regexes: List[str] = None,
             probe_type: str = None,
             loss_on_all_vocab: bool = False,
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)

    self._loss_on_all_vocab = loss_on_all_vocab
    self._predictions_file = predictions_file

    # TODO move to predict
    if predictions_file is not None and os.path.isfile(predictions_file):
        os.remove(predictions_file)

    self._pretrained_model = pretrained_model
    if 'roberta' in pretrained_model:
        self._padding_value = 1  # The index of the RoBERTa padding token
        if loss_on_all_vocab:
            self._transformer_model = RobertaForMaskedLM.from_pretrained(pretrained_model)
        else:
            self._transformer_model = RobertaForMultiChoiceMaskedLM.from_pretrained(pretrained_model)
    elif 'xlnet' in pretrained_model:
        self._padding_value = 5  # The index of the XLNet padding token
        self._transformer_model = XLNetLMHeadModel.from_pretrained(pretrained_model)
    elif 'albert' in pretrained_model:
        if loss_on_all_vocab:
            self._transformer_model = AlbertForMaskedLM.from_pretrained(pretrained_model)
        else:
            self._transformer_model = BertForMultiChoiceMaskedLM.from_pretrained(pretrained_model)
        self._padding_value = 0  # The index of the BERT padding token
    elif 'bert' in pretrained_model:
        if loss_on_all_vocab:
            self._transformer_model = BertForMaskedLM.from_pretrained(pretrained_model)
        else:
            self._transformer_model = BertForMultiChoiceMaskedLM.from_pretrained(pretrained_model)
        self._padding_value = 0  # The index of the BERT padding token
    else:
        raise ValueError("Unsupported pretrained model: {}".format(pretrained_model))

    if probe_type == 'MLP':
        layer_freeze_regexes = ["embeddings", "encoder", "pooler"]
    elif probe_type == 'linear':
        layer_freeze_regexes = [
            "embeddings", "encoder", "pooler", "dense", "LayerNorm", "layer_norm"
        ]

    for name, param in self._transformer_model.named_parameters():
        if layer_freeze_regexes and requires_grad:
            grad = not any(bool(re.search(r, name)) for r in layer_freeze_regexes)
        else:
            grad = requires_grad
        param.requires_grad = grad

    # make sure decoder gradients are on
    if 'roberta' in pretrained_model:
        self._transformer_model.lm_head.decoder.weight.requires_grad = True
        self._transformer_model.lm_head.bias.requires_grad = True
    elif 'albert' in pretrained_model:
        pass
    elif 'bert' in pretrained_model:
        self._transformer_model.cls.predictions.decoder.weight.requires_grad = True
        self._transformer_model.cls.predictions.bias.requires_grad = True

    transformer_config = self._transformer_model.config
    transformer_config.num_labels = 1
    self._output_dim = self._transformer_model.config.hidden_size

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()
    self._debug = 2
def run_pplm_example_bert(pretrained_model="bert-base-cased",
                          mask_prob=0.5,
                          do_selective_mask=True,
                          cond_text="",
                          num_samples=1,
                          bag_of_words=None,
                          discrim=None,
                          discrim_weights=None,
                          discrim_meta=None,
                          class_label=-1,
                          length=100,
                          stepsize=0.02,
                          temperature=1.0,
                          top_k=10,
                          sample=True,
                          num_iterations=3,
                          grad_length=10000,
                          horizon_length=1,
                          decay=False,
                          gamma=1.5,
                          gm_scale=0.9,
                          kl_scale=0.01,
                          seed=0,
                          no_cuda=False,
                          colorama=False,
                          strategy='pick_best',
                          verbosity='regular',
                          return_sent=False):
    # This is the main function for the BERT variant.

    # set random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set verbosity
    verbosity_level = VERBOSITY_LEVELS.get(verbosity.lower(), REGULAR)

    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    # set up the discriminator; a 'generic' discriminator modifies the global
    # model parameters
    if discrim == 'generic':
        set_generic_model_params(discrim_weights, discrim_meta)

    if discrim is not None:
        discriminator_pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
            "pretrained_model"]
        if pretrained_model != discriminator_pretrained_model:
            pretrained_model = discriminator_pretrained_model
            if verbosity_level >= REGULAR:
                print("discrim = {}, pretrained_model set "
                      "to discriminator's = {}".format(discrim, pretrained_model))

    # load pretrained model
    model = BertForMaskedLM.from_pretrained(pretrained_model,
                                            output_hidden_states=True)
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = BertTokenizer.from_pretrained(pretrained_model)

    # freeze BERT weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text; unlike the unconditional GPT-2 case, the
    # prompt here needs to be a whole sentence
    raw_text = cond_text
    while not raw_text:
        print("Did you forget to add `--cond_text`? ")
        raw_text = input("Model prompt >>> ")

    # difference to the GPT-2 variant: the eos token is also added here
    # (as opposed to only bos)
    tokenized_cond_text = tokenizer.encode(raw_text)

    print("= Original sentence =")
    print(tokenizer.decode(tokenized_cond_text))
    print()

    # mask out tokens, either randomly (a fixed fraction of the sentence) or
    # selectively, guided by the discriminator
    sent_len = len(tokenized_cond_text) - 2
    # masked_indices = np.random.choice(
    #     range(1, len(tokenized_cond_text) - 1), int(sent_len * mask_prob))
    if discrim is not None and do_selective_mask:
        classifier, class_id = get_classifier(discrim, class_label, device)
        masked_indices, init_score, masked_score = selective_mask(
            raw_text, mask_prob, model, tokenizer, classifier, class_id,
            device, strategy)
        orig_scores = [init_score, masked_score]
    # masked_indices = np.array([5, 6, 7])

    # get the mask labels; note that newer transformers versions changed the
    # ignore_index to -100
    masked_lm_labels = [[-100 for _ in range(len(tokenized_cond_text))]]
    for ind in masked_indices:
        masked_lm_labels[0][ind] = tokenized_cond_text[ind]
    masked_lm_labels = torch.tensor(masked_lm_labels,
                                    device=device,
                                    dtype=torch.long)

    for ind in masked_indices:
        tokenized_cond_text[ind] = tokenizer.convert_tokens_to_ids(
            tokenizer.mask_token)

    # print the masked version of the input text
    print("After masking")
    masked_text = tokenizer.decode(tokenized_cond_text)
    print(masked_text)

    # generate unperturbed and perturbed texts
    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
    # the unperturbed text is the BERT-completed sentence without perturbation
    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation_bert(
        model=model,
        tokenizer=tokenizer,
        context=tokenized_cond_text,
        masked_indices=masked_indices,
        masked_lm_labels=masked_lm_labels,
        device=device,
        num_samples=num_samples,
        bag_of_words=bag_of_words,
        discrim=discrim,
        class_label=class_label,
        stepsize=stepsize,
        temperature=temperature,
        top_k=top_k,
        sample=sample,
        num_iterations=num_iterations,
        grad_length=grad_length,
        horizon_length=horizon_length,
        decay=decay,
        gamma=gamma,
        gm_scale=gm_scale,
        kl_scale=kl_scale,
        verbosity_level=verbosity_level)

    # untokenize unperturbed text
    print('UNPERT\n')
    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text)

    if verbosity_level >= REGULAR:
        print("=" * 80)
        print("= Unperturbed generated text =")
        print(unpert_gen_text)
        print()

    generated_texts = []

    bow_word_ids = set()
    if bag_of_words and colorama:
        bow_indices = get_bag_of_words_indices_bert(bag_of_words.split(";"),
                                                    tokenizer)
        for single_bow_list in bow_indices:
            # filter out all words in the list composed of more than one token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
            # w[0] is safe because the filter above guarantees one token per entry
            bow_word_ids.update(w[0] for w in filtered)

    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize perturbed text
            if colorama:
                import colorama
                pert_gen_text = ''
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        pert_gen_text += '{}{}{}'.format(
                            colorama.Fore.RED,
                            tokenizer.decode([word_id]),
                            colorama.Style.RESET_ALL)
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
            else:
                pert_gen_text = tokenizer.decode(pert_gen_tok_text)

            print("= Perturbed generated text {} =".format(i + 1))
            print(pert_gen_text)
            print()
        except:
            pass

        # keep the prefix, perturbed seq, original seq for each index
        # return should contain: masked sentence, pert_gen_text, unpert_gen_text
        # scores = [initial_score, score_after_masking, score_after_filling_in]
        new_score = get_score(pert_gen_tok_text, model, classifier, device)
        generated_texts.append((pert_gen_text, unpert_gen_text, new_score))

    if return_sent:
        return [masked_text, orig_scores, generated_texts]
    return
def train_cbert_and_augment(args):
    task_name = args.task_name

    os.makedirs(args.output_dir, exist_ok=True)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.deterministic = True

    processor = get_task_processor(task_name, args.data_dir)
    label_list = processor.get_labels(task_name)

    # load train and dev data
    train_examples = processor.get_train_examples()
    dev_examples = processor.get_dev_examples()

    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL,
                                              do_lower_case=True,
                                              cache_dir=args.cache)
    model = BertForMaskedLM.from_pretrained(BERT_MODEL, cache_dir=args.cache)

    # for tasks with more than two labels, widen the token type (segment)
    # embedding so the label id can be fed in as the segment id
    if len(label_list) > 2:
        model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(
            len(label_list), 768)
        model.bert.embeddings.token_type_embeddings.weight.data.normal_(
            mean=0.0, std=0.02)
    model.to(device)

    # train data
    train_features = convert_examples_to_features(train_examples, label_list,
                                                  args.max_seq_length,
                                                  tokenizer, args.seed)
    train_data = prepare_data(train_features)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # dev data
    dev_features = convert_examples_to_features(dev_examples, label_list,
                                                args.max_seq_length,
                                                tokenizer, args.seed)
    dev_data = prepare_data(dev_features)
    dev_sampler = SequentialSampler(dev_data)
    dev_dataloader = DataLoader(dev_data,
                                sampler=dev_sampler,
                                batch_size=args.train_batch_size)

    num_train_steps = int(
        len(train_features) / args.train_batch_size * args.num_train_epochs)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_features))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    # prepare optimizer
    t_total = num_train_steps
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=1e-8)

    best_dev_loss = float('inf')
    for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
        avg_loss = 0.
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch[1],
                'attention_mask': batch[2],
                'token_type_ids': batch[3],
                'masked_lm_labels': batch[4]
            }
            outputs = model(**inputs)
            loss = outputs[0]
            optimizer.zero_grad()
            loss.backward()
            avg_loss += loss.item()
            optimizer.step()
            if (step + 1) % 50 == 0:
                print("avg_loss: {}".format(avg_loss / 50))
                avg_loss = 0.

        # eval on dev after every epoch
        dev_loss = compute_dev_loss(model, dev_dataloader)
        print("Epoch {}, Dev loss {}".format(epoch, dev_loss))
        if dev_loss < best_dev_loss:
            best_dev_loss = dev_loss
            print("Saving model. Best dev so far {}".format(best_dev_loss))
            save_model_path = os.path.join(args.output_dir, 'best_cbert.pt')
            torch.save(model.state_dict(), save_model_path)

    # augment data using the best model
    augment_train_data(model, tokenizer, train_data, label_list, args)
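# Minimal invocation sketch. The Namespace carries the attributes this function
# reads directly (augment_train_data may expect additional ones); every value
# below is a placeholder, not an original default.
import argparse

args = argparse.Namespace(task_name='subj',
                          data_dir='datasets/subj',
                          output_dir='cbert_output',
                          cache='cache',
                          seed=42,
                          max_seq_length=64,
                          train_batch_size=32,
                          num_train_epochs=10,
                          learning_rate=4e-5)
train_cbert_and_augment(args)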
def __init__(self,
             model,
             train_dataset,
             trainer_config,
             writer,
             logger=None,
             test_dataset=None,
             valid_dataset=None,
             n_jobs=0,
             label_smoothing=0,
             device=torch.device('cuda'),
             evaluate_full_sequences=False,
             ignore_idxs=[],
             full_input=False,
             max_length=511,
             max_y_length=80,
             new_dataset=False,
             best_model_path='',
             no_persona=False,
             mixup=False,
             mixup_mode='alternate',
             mixup_dataset=None,
             mixup_ratio=0.15,
             bert_mixup=False,
             replace=False,
             pointer_gen=False):
    if logger is None:
        self.logger = logging.getLogger(__file__)
    else:
        self.logger = logger
    self.train_batch_size = trainer_config.train_batch_size
    self.test_batch_size = trainer_config.test_batch_size
    self.lr = trainer_config.lr
    self.lr_warmup = trainer_config.lr_warmup
    self.weight_decay = trainer_config.weight_decay
    self.batch_split = trainer_config.batch_split
    self.s2s_weight = 1
    self.single_input = True
    self.clip_grad = trainer_config.clip_grad
    self.n_epochs = trainer_config.n_epochs
    self.linear_schedule = trainer_config.linear_schedule
    self.patience = trainer_config.patience
    self.model_saving_interval = trainer_config.model_saving_interval
    self.device = device
    self.no_persona = no_persona
    self.evaluate_full_sequences = evaluate_full_sequences
    self.global_step = 0
    self.full_input = full_input
    self.max_length = max_length
    self.max_y_length = max_y_length
    self.new_dataset = new_dataset
    self.best_ppl = 1e5
    self.best_model_path = best_model_path
    self.mixup_mode = mixup_mode
    self.replace = replace
    self.mixup = mixup
    self.mixup_dataset = mixup_dataset
    self.mixup_ratio = mixup_ratio
    self.model_type = 'pretrain'
    self.patience_cnt = 0
    self.stop_training = False
    self.pointer_gen = pointer_gen

    self.model = model.to(device)

    self.criterion = LabelSmoothingLoss(
        n_labels=self.model.n_embeddings,
        smoothing=label_smoothing,
        ignore_index=self.model.padding_idx).to(device)

    param_optimizer = list(self.model.named_parameters())
    # Here we should remove parameters which are not used during training
    # to avoid breaking apex with None grads
    self.loss_weight = None
    no_decay = ['bias', 'loss']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': self.weight_decay
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    base_optimizer = Adam(optimizer_grouped_parameters, lr=self.lr)
    if not self.linear_schedule:
        self.optimizer = NoamOpt(self.model.embeddings_size,
                                 self.lr_warmup,
                                 base_optimizer,
                                 lr=self.lr,
                                 linear_schedule=False,
                                 loss_weight=self.loss_weight)
    else:
        total_steps = len(train_dataset) * self.n_epochs // self.train_batch_size
        self.optimizer = NoamOpt(self.model.embeddings_size,
                                 self.lr_warmup,
                                 base_optimizer,
                                 linear_schedule=True,
                                 lr=self.lr,
                                 total_steps=total_steps,
                                 loss_weight=self.loss_weight)

    train_sampler = RandomSampler(train_dataset)
    self.train_dataloader = DataLoader(
        train_dataset,
        batch_size=self.train_batch_size // self.batch_split,
        sampler=train_sampler,
        num_workers=n_jobs,
        collate_fn=self.collate_func)
    self.train_dataset = train_dataset  # used to sample negative examples

    if test_dataset is not None:  # only do evaluation on main process
        self.test_dataloader = DataLoader(test_dataset,
                                          batch_size=self.test_batch_size,
                                          shuffle=False,
                                          num_workers=n_jobs,
                                          collate_fn=self.collate_func)
    if valid_dataset is not None:
        self.valid_dataloader = DataLoader(valid_dataset,
                                           batch_size=self.test_batch_size,
                                           shuffle=False,
                                           num_workers=n_jobs,
                                           collate_fn=self.collate_func)

    self.bert_mixup = bert_mixup
    if bert_mixup:
        self.bert_model = BertForMaskedLM.from_pretrained('./bert_model').to(device)
        self.bert_tokenizer = BertTokenizer.from_pretrained('./bert_model')

    self.vocab = train_dataset.vocab
    self.writer = writer

    if isinstance(self.model, TransformerSeq2Seq):
        self.model_type = 'seq2seq'
import torch
from flask import Flask, request, render_template
from transformers.modeling_bert import BertForMaskedLM
from transformers.tokenization_bert_japanese import BertJapaneseTokenizer

app = Flask(__name__)

model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)
model.eval()


def get_prediction(s):
    assert '[MASK]' in s
    input_ids = tokenizer.encode(s, return_tensors="pt")
    masked_index = torch.where(input_ids == tokenizer.mask_token_id)[1].tolist()[0]
    with torch.no_grad():
        # (1, seq_len, vocab_size)
        logits, = model(input_ids)
    # (vocab_size,)
    probs_for_mask = torch.softmax(logits[0, masked_index], dim=-1)
    topk_probs, topk_indices = torch.topk(probs_for_mask, k=10)
    return [tokenizer.decode([i]) for i in topk_indices.tolist()], topk_probs.tolist()


@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        s = request.form['sent']
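        # A possible continuation (sketch only): the template name 'index.html'
        # and the variable names passed to it are assumptions, not taken from
        # the original snippet, which is truncated at this point.
        tokens, probs = get_prediction(s)
        return render_template('index.html', sent=s, results=list(zip(tokens, probs)))
    return render_template('index.html')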