# Assumed imports: BertTokenizer ships with the `transformers` package (the
# constructor keywords below follow the older pytorch_pretrained_bert API);
# TokenizerSpec, remove_spaces, and handle_quotes come from the surrounding project.
from transformers import BertTokenizer


class NemoBertTokenizer(TokenizerSpec):
    def __init__(self, pretrained_model=None, vocab_file=None, do_lower_case=True,
                 max_len=None, do_basic_tokenize=True,
                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        if pretrained_model:
            self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
            # Cased checkpoints must not be lower-cased by the basic tokenizer.
            if "uncased" not in pretrained_model:
                self.tokenizer.basic_tokenizer.do_lower_case = False
        else:
            self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case,
                                           max_len=max_len,
                                           do_basic_tokenize=do_basic_tokenize,
                                           never_split=never_split)
        self.vocab_size = len(self.tokenizer.vocab)
        self.never_split = never_split

    def text_to_tokens(self, text):
        return self.tokenizer.tokenize(text)

    def tokens_to_text(self, tokens):
        text = self.tokenizer.convert_tokens_to_string(tokens)
        return remove_spaces(handle_quotes(text.strip()))

    def token_to_id(self, token):
        return self.tokens_to_ids([token])[0]

    def tokens_to_ids(self, tokens):
        return self.tokenizer.convert_tokens_to_ids(tokens)

    def ids_to_tokens(self, ids):
        return self.tokenizer.convert_ids_to_tokens(ids)

    def text_to_ids(self, text):
        return self.tokens_to_ids(self.text_to_tokens(text))

    def ids_to_text(self, ids):
        tokens = self.ids_to_tokens(ids)
        # Drop special tokens before detokenizing.
        tokens_clean = [t for t in tokens if t not in self.never_split]
        return self.tokens_to_text(tokens_clean)

    def pad_id(self):
        return self.tokens_to_ids(["[PAD]"])[0]

    def bos_id(self):
        return self.tokens_to_ids(["[CLS]"])[0]

    def eos_id(self):
        return self.tokens_to_ids(["[SEP]"])[0]
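# A quick usage sketch of the wrapper above; the checkpoint name is illustrative,
# and any cased or uncased BERT checkpoint should behave the same way:
tokenizer = NemoBertTokenizer(pretrained_model="bert-base-uncased")
print(tokenizer.text_to_tokens("Masked language models are bidirectional."))
print(tokenizer.text_to_ids("hello world"))
print(tokenizer.bos_id(), tokenizer.eos_id(), tokenizer.pad_id())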
import math
import time

import numpy as np
import torch
from transformers import BertForMaskedLM, BertTokenizer


class BertGeneration(object):
    def __init__(self, model_directory, vocab_file, lower=False):
        # Load the pre-trained model weights and put the model in eval mode.
        self.model = BertForMaskedLM.from_pretrained(model_directory)
        self.model.eval()
        self.cuda = torch.cuda.is_available()
        if self.cuda:
            self.model = self.model.cuda()
        # Load the pre-trained tokenizer (vocabulary).
        self.tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=lower)
        self.CLS = '[CLS]'
        self.SEP = '[SEP]'
        self.MASK = '[MASK]'
        self.mask_id = self.tokenizer.convert_tokens_to_ids([self.MASK])[0]
        self.sep_id = self.tokenizer.convert_tokens_to_ids([self.SEP])[0]
        self.cls_id = self.tokenizer.convert_tokens_to_ids([self.CLS])[0]

    def tokenize_batch(self, batch):
        return [self.tokenizer.convert_tokens_to_ids(sent) for sent in batch]

    def untokenize_batch(self, batch):
        return [self.tokenizer.convert_ids_to_tokens(sent) for sent in batch]

    def detokenize(self, sent):
        """ Roughly detokenizes (mainly undoes WordPiece). """
        new_sent = []
        for tok in sent:
            if tok.startswith("##"):
                new_sent[-1] = new_sent[-1] + tok[2:]
            else:
                new_sent.append(tok)
        return new_sent

    def generate_step(self, out, gen_idx, temperature=None, top_k=0, sample=False,
                      return_list=True):
        """ Generate a word from out[gen_idx]
        args:
            - out (torch.Tensor): logits of size batch_size x seq_len x vocab_size
            - gen_idx (int): location for which to generate
            - top_k (int): if > 0, only sample from the top k most probable words
            - sample (bool): if True, sample from the full distribution; overridden by top_k
        """
        logits = out[:, gen_idx]
        if temperature is not None:
            logits = logits / temperature
        if top_k > 0:
            kth_vals, kth_idx = logits.topk(top_k, dim=-1)
            dist = torch.distributions.categorical.Categorical(logits=kth_vals)
            idx = kth_idx.gather(dim=1, index=dist.sample().unsqueeze(-1)).squeeze(-1)
        elif sample:
            dist = torch.distributions.categorical.Categorical(logits=logits)
            idx = dist.sample().squeeze(-1)
        else:
            idx = torch.argmax(logits, dim=-1)
        return idx.tolist() if return_list else idx

    def get_init_text(self, seed_text, max_len, batch_size=1, rand_init=False):
        """ Get the initial sentence by padding seed_text with either masks or
        random words out to max_len. """
        batch = [seed_text + [self.MASK] * max_len + [self.SEP]
                 for _ in range(batch_size)]
        # if rand_init:
        #     for ii in range(max_len):
        #         init_idx[seed_len + ii] = np.random.randint(0, len(tokenizer.vocab))
        return self.tokenize_batch(batch)

    def printer(self, sent, should_detokenize=True):
        if should_detokenize:
            sent = self.detokenize(sent)[1:-1]  # drop [CLS] and [SEP]
        print(" ".join(sent))

    # This is the meat of the algorithm. The general idea is:
    #   1. start from all masks;
    #   2. repeatedly pick a location, mask the token at that location, and
    #      generate from the probability distribution given by BERT;
    #   3. stop when converged, or when tired of waiting.
    # We consider three "modes" of generating:
    #   - generate a single token for a position chosen uniformly at random,
    #     for a chosen number of time steps;
    #   - generate in sequential order (left to right), one token at a time;
    #   - generate for all positions at once, for a chosen number of time steps.
    # The `generate` function wraps and batches these three generation modes.
    # In practice, we find that the first leads to the most fluent samples.
    # Generation modes as functions
    def parallel_sequential_generation(self, seed_text, batch_size=10, max_len=15,
                                       top_k=0, temperature=None, max_iter=300,
                                       burnin=200, cuda=False, print_every=10,
                                       verbose=True):
        """ Generate for one random position at a time step
        args:
            - burnin: during the burn-in period, sample from the full
              distribution; afterwards, take the argmax
        """
        seed_len = len(seed_text)
        batch = self.get_init_text(seed_text, max_len, batch_size)
        for ii in range(max_iter):
            # Pick one position at random and re-mask it in every batch element.
            kk = np.random.randint(0, max_len)
            for jj in range(batch_size):
                batch[jj][seed_len + kk] = self.mask_id
            inp = torch.tensor(batch).cuda() if cuda else torch.tensor(batch)
            out = self.model(inp)[0]
            # Resample that position from BERT's distribution.
            topk = top_k if (ii >= burnin) else 0
            idxs = self.generate_step(out, gen_idx=seed_len + kk, top_k=topk,
                                      temperature=temperature, sample=(ii < burnin))
            for jj in range(batch_size):
                batch[jj][seed_len + kk] = idxs[jj]
            if verbose and np.mod(ii + 1, print_every) == 0:
                for_print = self.tokenizer.convert_ids_to_tokens(batch[0])
                for_print = (for_print[:seed_len + kk + 1] + ['(*)']
                             + for_print[seed_len + kk + 1:])
                print("iter", ii + 1, " ".join(for_print))
        return self.untokenize_batch(batch)

    def parallel_generation(self, seed_text, batch_size=10, max_len=15, top_k=0,
                            temperature=None, max_iter=300, sample=True,
                            cuda=False, print_every=10, verbose=True):
        """ Generate for all positions at each time step. """
        seed_len = len(seed_text)
        batch = self.get_init_text(seed_text, max_len, batch_size)
        for ii in range(max_iter):
            inp = torch.tensor(batch).cuda() if cuda else torch.tensor(batch)
            out = self.model(inp)[0]
            for kk in range(max_len):
                idxs = self.generate_step(out, gen_idx=seed_len + kk, top_k=top_k,
                                          temperature=temperature, sample=sample)
                for jj in range(batch_size):
                    batch[jj][seed_len + kk] = idxs[jj]
            if verbose and np.mod(ii, print_every) == 0:
                print("iter", ii + 1,
                      " ".join(self.tokenizer.convert_ids_to_tokens(batch[0])))
        return self.untokenize_batch(batch)

    def sequential_generation(self, seed_text, batch_size=10, max_len=15,
                              leed_out_len=15, top_k=0, temperature=None,
                              sample=True, cuda=False):
        """ Generate one word at a time, in left-to-right order. """
        seed_len = len(seed_text)
        batch = self.get_init_text(seed_text, max_len, batch_size)
        for ii in range(max_len):
            # Feed only the prefix plus a leed_out_len look-ahead window,
            # closed with [SEP].
            inp = [sent[:seed_len + ii + leed_out_len] + [self.sep_id]
                   for sent in batch]
            inp = torch.tensor(inp).cuda() if cuda else torch.tensor(inp)
            out = self.model(inp)[0]
            idxs = self.generate_step(out, gen_idx=seed_len + ii, top_k=top_k,
                                      temperature=temperature, sample=sample)
            for jj in range(batch_size):
                batch[jj][seed_len + ii] = idxs[jj]
        return self.untokenize_batch(batch)

    def generate(self, n_samples, seed_text=("[CLS]",), batch_size=10, max_len=25,
                 generation_mode="parallel-sequential", sample=True, top_k=100,
                 temperature=1.0, burnin=200, max_iter=500, cuda=False,
                 print_every=1, leed_out_len=15):
        """ Main generation function to call: batches one of the three modes. """
        seed_text = list(seed_text)  # the seed is a list of tokens, e.g. ["[CLS]"]
        sentences = []
        n_batches = math.ceil(n_samples / batch_size)
        start_time = time.time()
        for batch_n in range(n_batches):
            if generation_mode == "parallel-sequential":
                batch = self.parallel_sequential_generation(
                    seed_text, batch_size=batch_size, max_len=max_len,
                    top_k=top_k, temperature=temperature, burnin=burnin,
                    max_iter=max_iter, cuda=cuda, verbose=False)
            elif generation_mode == "sequential":
                batch = self.sequential_generation(
                    seed_text, batch_size=batch_size, max_len=max_len,
                    top_k=top_k, temperature=temperature,
                    leed_out_len=leed_out_len, sample=sample, cuda=cuda)
            elif generation_mode == "parallel":
                batch = self.parallel_generation(
                    seed_text, batch_size=batch_size, max_len=max_len,
                    top_k=top_k, temperature=temperature, sample=sample,
                    max_iter=max_iter, cuda=cuda, verbose=False)
            else:
                raise ValueError("unknown generation_mode: %s" % generation_mode)
            if (batch_n + 1) % print_every == 0:
                print("Finished batch %d in %.3fs"
                      % (batch_n + 1, time.time() - start_time))
                start_time = time.time()
            sentences += batch
        return sentences
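# A usage sketch for the sampler; the checkpoint directory and vocab path are
# placeholders, and the sampling settings mirror the defaults above:
generator = BertGeneration("path/to/bert_checkpoint_dir", "path/to/vocab.txt",
                           lower=True)
sents = generator.generate(n_samples=20, seed_text=["[CLS]"],
                           generation_mode="parallel-sequential",
                           top_k=100, temperature=1.0, cuda=generator.cuda)
for sent in sents[:5]:
    generator.printer(sent, should_detokenize=True)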
CLS, SEP, MASK = '[CLS]', '[SEP]', '[MASK]'
# `args` comes from argparse setup not shown here
# (it needs bert_vocab, bert_model, and topk).

bert_tokenizer = BertTokenizer(vocab_file=args.bert_vocab)
print('Initialize BERT model from {}...'.format(args.bert_model))
config = BertConfig.from_json_file('./bert-base-uncased/config.json')
bert_model = BertForMaskedLM.from_pretrained('./bert-base-uncased/pytorch_model.bin',
                                             config=config)
bert_model.eval()  # disable dropout for deterministic predictions

while True:
    message = input('Enter your message: ').strip()
    tokens = bert_tokenizer.tokenize(message)
    if len(tokens) == 0:
        continue
    # Make sure the input is wrapped in [CLS] ... [SEP].
    if tokens[0] != CLS:
        tokens = [CLS] + tokens
    if tokens[-1] != SEP:
        tokens.append(SEP)
    token_idx, segment_idx, mask = to_bert_input(tokens, bert_tokenizer)
    with torch.no_grad():
        logits = bert_model(token_idx, segment_idx, mask, masked_lm_labels=None)
    logits = logits[0].squeeze(0)  # (seq_len, vocab_size)
    probs = torch.softmax(logits, dim=-1)
    mask_cnt = 0
    for idx, token in enumerate(tokens):
        if token == MASK:
            mask_cnt += 1
            print('Top {} predictions for {}th {}:'.format(args.topk, mask_cnt, MASK))
            topk_prob, topk_indices = torch.topk(probs[idx, :], args.topk)
            topk_tokens = bert_tokenizer.convert_ids_to_tokens(
                topk_indices.cpu().numpy())
            for prob, tok in zip(topk_prob, topk_tokens):
                print('{} {}'.format(tok, prob.item()))
    print('=' * 80)
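# `to_bert_input` is used above but not defined in this snippet. A minimal
# sketch under the assumption of a single segment with no padding, so the
# segment ids are all zeros and the attention mask is all ones:
def to_bert_input(tokens, bert_tokenizer):
    token_idx = torch.tensor([bert_tokenizer.convert_tokens_to_ids(tokens)])
    segment_idx = torch.zeros_like(token_idx)  # one segment -> all-zero type ids
    mask = torch.ones_like(token_idx)          # no padding  -> attend everywhere
    return token_idx, segment_idx, mask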
class BertGeneration(object):
    def __init__(self, model_directory, vocab_file, lower=False):
        # Load the pre-trained model weights and put the model in eval mode.
        self.model = BertForMaskedLM.from_pretrained(model_directory)
        self.model.eval()
        self.cuda = torch.cuda.is_available()
        if self.cuda:
            self.model = self.model.cuda()
        # Load the pre-trained tokenizer (vocabulary).
        self.tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=lower)
        self.CLS = '[CLS]'
        self.SEP = '[SEP]'
        self.MASK = '[MASK]'
        self.mask_id = self.tokenizer.convert_tokens_to_ids([self.MASK])[0]
        self.sep_id = self.tokenizer.convert_tokens_to_ids([self.SEP])[0]
        self.cls_id = self.tokenizer.convert_tokens_to_ids([self.CLS])[0]

    def tokenize_batch(self, batch):
        return [self.tokenizer.convert_tokens_to_ids(sent) for sent in batch]

    def untokenize_batch(self, batch):
        return [self.tokenizer.convert_ids_to_tokens(sent) for sent in batch]

    def detokenize(self, sent):
        """ Roughly detokenizes (mainly undoes WordPiece). """
        new_sent = []
        for tok in sent:
            if tok.startswith("##"):
                new_sent[-1] = new_sent[-1] + tok[2:]
            else:
                new_sent.append(tok)
        return new_sent

    def printer(self, sent, should_detokenize=True):
        if should_detokenize:
            sent = self.detokenize(sent)[1:-1]  # drop [CLS] and [SEP]
        print(" ".join(sent))

    def predict_masked(self, sent):
        """ Return the top-5 candidates for every [MASK] in `sent`
        (a WordPiece token list without [CLS]/[SEP]). """
        tokens = ['[CLS]'] + sent + ['[SEP]']
        target_indices = [i for i, x in enumerate(tokens) if x == '[MASK]']
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tens = torch.LongTensor(input_ids).unsqueeze(0)
        if self.cuda:
            tens = tens.cuda()
        try:
            res = self.model(tens)[0]
        except RuntimeError:
            # Error in the model vocabulary; remove when a correct model is trained.
            return None
        target_tensor = torch.LongTensor(target_indices)
        if self.cuda:
            target_tensor = target_tensor.cuda()
        # Keep only the logits at the masked positions, then take the top 5 ids.
        res = torch.index_select(res, 1, target_tensor)
        res = torch.narrow(torch.argsort(res, dim=-1, descending=True), -1, 0, 5)
        predicted = []
        for mask in res[0]:
            candidates = self.tokenizer.convert_ids_to_tokens(
                [i.item() for i in mask])
            predicted.append(candidates)
        return predicted
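# A usage sketch for predict_masked; the model directory and vocab path are
# placeholders, and the input is a WordPiece token list without [CLS]/[SEP]:
bg = BertGeneration("path/to/bert_checkpoint_dir", "path/to/vocab.txt", lower=True)
preds = bg.predict_masked(["the", "capital", "of", "france", "is", "[MASK]", "."])
if preds is not None:          # None signals the vocabulary-mismatch fallback above
    for candidates in preds:   # one top-5 candidate list per [MASK]
        print(candidates)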