Example no. 1
# Assumed context for this example: TokenizerSpec, remove_spaces and
# handle_quotes come from NeMo's tokenizer utilities (module paths vary by
# NeMo version), and BertTokenizer from the transformers package.
class NemoBertTokenizer(TokenizerSpec):
    def __init__(self, pretrained_model=None,
                 vocab_file=None,
                 do_lower_case=True,
                 max_len=None,
                 do_basic_tokenize=True,
                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        if pretrained_model:
            self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
            if "uncased" not in pretrained_model:
                self.tokenizer.basic_tokenizer.do_lower_case = False
        else:
            self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case,
                                           max_len=max_len,
                                           do_basic_tokenize=do_basic_tokenize,
                                           never_split=never_split)
        self.vocab_size = len(self.tokenizer.vocab)
        self.never_split = never_split

    def text_to_tokens(self, text):
        tokens = self.tokenizer.tokenize(text)
        return tokens

    def tokens_to_text(self, tokens):
        text = self.tokenizer.convert_tokens_to_string(tokens)
        return remove_spaces(handle_quotes(text.strip()))

    def token_to_id(self, token):
        return self.tokens_to_ids([token])[0]

    def tokens_to_ids(self, tokens):
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        return ids

    def ids_to_tokens(self, ids):
        tokens = self.tokenizer.convert_ids_to_tokens(ids)
        return tokens

    def text_to_ids(self, text):
        tokens = self.text_to_tokens(text)
        ids = self.tokens_to_ids(tokens)
        return ids

    def ids_to_text(self, ids):
        tokens = self.ids_to_tokens(ids)
        tokens_clean = [t for t in tokens if t not in self.never_split]
        text = self.tokens_to_text(tokens_clean)
        return text

    def pad_id(self):
        return self.tokens_to_ids(["[PAD]"])[0]

    def bos_id(self):
        return self.tokens_to_ids(["[CLS]"])[0]

    def eos_id(self):
        return self.tokens_to_ids(["[SEP]"])[0]
Example no. 2
import math
import time

import numpy as np
import torch
# BertForMaskedLM / BertTokenizer come from the transformers package
# (or pytorch_pretrained_bert in older setups):
from transformers import BertForMaskedLM, BertTokenizer


class BertGeneration(object):
    def __init__(self, model_directory, vocab_file, lower=False):

        # Load pre-trained model (weights)

        self.model = BertForMaskedLM.from_pretrained(model_directory)
        self.model.eval()
        self.cuda = torch.cuda.is_available()
        if self.cuda:
            self.model = self.model.cuda()

        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                       do_lower_case=lower)

        self.CLS = '[CLS]'
        self.SEP = '[SEP]'
        self.MASK = '[MASK]'
        self.mask_id = self.tokenizer.convert_tokens_to_ids([self.MASK])[0]
        self.sep_id = self.tokenizer.convert_tokens_to_ids([self.SEP])[0]
        self.cls_id = self.tokenizer.convert_tokens_to_ids([self.CLS])[0]

    def tokenize_batch(self, batch):
        return [self.tokenizer.convert_tokens_to_ids(sent) for sent in batch]

    def untokenize_batch(self, batch):
        return [self.tokenizer.convert_ids_to_tokens(sent) for sent in batch]

    def detokenize(self, sent):
        """ Roughly detokenizes (mainly undoes wordpiece) """
        new_sent = []
        for tok in sent:
            if tok.startswith("##"):
                # WordPiece continuation: merge into the previous token
                new_sent[-1] = new_sent[-1] + tok[2:]
            else:
                new_sent.append(tok)
        return new_sent

    def generate_step(self,
                      out,
                      gen_idx,
                      temperature=None,
                      top_k=0,
                      sample=False,
                      return_list=True):
        """ Generate a word from from out[gen_idx]
        
        args:
            - out (torch.Tensor): tensor of logits of size batch_size x seq_len x vocab_size
            - gen_idx (int): location for which to generate for
            - top_k (int): if >0, only sample from the top k most probable words
            - sample (Bool): if True, sample from full distribution. Overridden by top_k 
        """
        logits = out[:, gen_idx]
        if temperature is not None:
            logits = logits / temperature
        if top_k > 0:
            kth_vals, kth_idx = logits.topk(top_k, dim=-1)
            dist = torch.distributions.categorical.Categorical(logits=kth_vals)
            idx = kth_idx.gather(dim=1,
                                 index=dist.sample().unsqueeze(-1)).squeeze(-1)
        elif sample:
            dist = torch.distributions.categorical.Categorical(logits=logits)
            idx = dist.sample().squeeze(-1)
        else:
            idx = torch.argmax(logits, dim=-1)
        return idx.tolist() if return_list else idx

    def get_init_text(self, seed_text, max_len, batch_size=1, rand_init=False):
        """ Get the initial sentence by padding seed_text with masks up to max_len
        (rand_init is accepted but random-word initialization is not implemented) """
        batch = [
            seed_text + [self.MASK] * max_len + [self.SEP]
            for _ in range(batch_size)
        ]
        return self.tokenize_batch(batch)

    def printer(self, sent, should_detokenize=True):
        if should_detokenize:
            sent = self.detokenize(sent)[1:-1]
        print(" ".join(sent))

    # This is the meat of the algorithm. The general idea is
    # 1. start from all masks
    # 2. repeatedly pick a location, mask the token at that location, and generate from the probability distribution given by BERT
    # 3. stop when converged or tired of waiting

    # We consider three "modes" of generating:
    # - generate a single token for a position chosen uniformly at random for a chosen number of time steps
    # - generate in sequential order (L->R), one token at a time
    # - generate for all positions at once for a chosen number of time steps

    # The `generate` function wraps and batches these three generation modes. In practice, we find that the first leads to the most fluent samples.

    # Generation modes as functions

    def parallel_sequential_generation(self,
                                       seed_text,
                                       batch_size=10,
                                       max_len=15,
                                       top_k=0,
                                       temperature=None,
                                       max_iter=300,
                                       burnin=200,
                                       cuda=False,
                                       print_every=10,
                                       verbose=True):
        """ Generate for one random position at a timestep
        
        args:
            - burnin: during burn-in period, sample from full distribution; afterwards take argmax
        """
        seed_len = len(seed_text)
        batch = self.get_init_text(seed_text, max_len, batch_size)

        for ii in range(max_iter):
            kk = np.random.randint(0, max_len)
            for jj in range(batch_size):
                batch[jj][seed_len + kk] = self.mask_id
            inp = torch.tensor(batch).cuda() if cuda else torch.tensor(batch)
            out = self.model(inp)[0]
            topk = top_k if (ii >= burnin) else 0
            idxs = self.generate_step(out,
                                      gen_idx=seed_len + kk,
                                      top_k=topk,
                                      temperature=temperature,
                                      sample=(ii < burnin))
            for jj in range(batch_size):
                batch[jj][seed_len + kk] = idxs[jj]

            if verbose and (ii + 1) % print_every == 0:
                for_print = self.tokenizer.convert_ids_to_tokens(batch[0])
                for_print = for_print[:seed_len + kk + 1] + [
                    '(*)'
                ] + for_print[seed_len + kk + 1:]
                print("iter", ii + 1, " ".join(for_print))

        return self.untokenize_batch(batch)

    def parallel_generation(self,
                            seed_text,
                            batch_size=10,
                            max_len=15,
                            top_k=0,
                            temperature=None,
                            max_iter=300,
                            sample=True,
                            cuda=False,
                            print_every=10,
                            verbose=True):
        """ Generate for all positions at each time step """
        seed_len = len(seed_text)
        batch = self.get_init_text(seed_text, max_len, batch_size)

        for ii in range(max_iter):
            inp = torch.tensor(batch).cuda() if cuda else torch.tensor(batch)
            out = self.model(inp)[0]
            for kk in range(max_len):
                idxs = self.generate_step(out,
                                          gen_idx=seed_len + kk,
                                          top_k=top_k,
                                          temperature=temperature,
                                          sample=sample)
                for jj in range(batch_size):
                    batch[jj][seed_len + kk] = idxs[jj]

            if verbose and (ii + 1) % print_every == 0:
                print("iter", ii + 1,
                      " ".join(self.tokenizer.convert_ids_to_tokens(batch[0])))

        return self.untokenize_batch(batch)

    def sequential_generation(self,
                              seed_text,
                              batch_size=10,
                              max_len=15,
                              leed_out_len=15,
                              top_k=0,
                              temperature=None,
                              sample=True,
                              cuda=False):
        """ Generate one word at a time, in L->R order """
        seed_len = len(seed_text)
        batch = self.get_init_text(seed_text, max_len, batch_size)

        for ii in range(max_len):
            inp = [
                sent[:seed_len + ii + leed_out_len] + [self.sep_id]
                for sent in batch
            ]
            # feed only the truncated ("leed-out") prefix at each step
            inp = torch.tensor(inp).cuda() if cuda else torch.tensor(inp)
            out = self.model(inp)[0]
            idxs = self.generate_step(out,
                                      gen_idx=seed_len + ii,
                                      top_k=top_k,
                                      temperature=temperature,
                                      sample=sample)
            for jj in range(batch_size):
                batch[jj][seed_len + ii] = idxs[jj]

        return self.untokenize_batch(batch)

    def generate(self,
                 n_samples,
                 seed_text="[CLS]",
                 batch_size=10,
                 max_len=25,
                 generation_mode="parallel-sequential",
                 sample=True,
                 top_k=100,
                 temperature=1.0,
                 burnin=200,
                 max_iter=500,
                 cuda=False,
                 print_every=1,
                 leed_out_len=15):
        # Main generation entry point: dispatches to one of the three modes above.
        sentences = []
        n_batches = math.ceil(n_samples / batch_size)
        start_time = time.time()
        for batch_n in range(n_batches):
            if generation_mode == "parallel-sequential":
                batch = self.parallel_sequential_generation(
                    seed_text,
                    batch_size=batch_size,
                    max_len=max_len,
                    top_k=top_k,
                    temperature=temperature,
                    burnin=burnin,
                    max_iter=max_iter,
                    cuda=cuda,
                    verbose=False)
            elif generation_mode == "sequential":
                batch = self.sequential_generation(seed_text,
                                                   batch_size=batch_size,
                                                   max_len=max_len,
                                                   top_k=top_k,
                                                   temperature=temperature,
                                                   leed_out_len=leed_out_len,
                                                   sample=sample,
                                                   cuda=cuda)
            elif generation_mode == "parallel":
                batch = self.parallel_generation(seed_text,
                                                 batch_size=batch_size,
                                                 max_len=max_len,
                                                 top_k=top_k,
                                                 temperature=temperature,
                                                 sample=sample,
                                                 max_iter=max_iter,
                                                 cuda=cuda,
                                                 verbose=False)

            if (batch_n + 1) % print_every == 0:
                print("Finished batch %d in %.3fs" %
                      (batch_n + 1, time.time() - start_time))
                start_time = time.time()

            sentences += batch
        return sentences
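
A minimal driver sketch for the class above. The model directory and vocab file are hypothetical placeholders; note that inputs are moved to the GPU only when cuda=True, so this should match where __init__ put the model, hence cuda=generator.cuda:

generator = BertGeneration(model_directory="./bert-base-uncased",
                           vocab_file="./bert-base-uncased/vocab.txt")
sents = generator.generate(n_samples=10,
                           seed_text=["[CLS]"],
                           generation_mode="parallel-sequential",
                           max_len=15,
                           top_k=100,
                           temperature=1.0,
                           burnin=200,
                           max_iter=300,
                           cuda=generator.cuda)
for sent in sents[:3]:
    generator.printer(sent, should_detokenize=True)
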
Example no. 3
    # Assumed context: args, the CLS/SEP/MASK constants and the to_bert_input()
    # helper are defined elsewhere in the original script (a sketch of
    # to_bert_input follows this example).
    bert_tokenizer = BertTokenizer(vocab_file=args.bert_vocab)
    print('Initialize BERT model from {}...'.format(args.bert_model))
    config = BertConfig.from_json_file('./bert-base-uncased/config.json')
    bert_model = BertForMaskedLM.from_pretrained('./bert-base-uncased/pytorch_model.bin', config=config)
    bert_model.eval()  # inference only: disable dropout

    while True:
        message = input('Enter your message: ').strip()
        tokens = bert_tokenizer.tokenize(message)
        if len(tokens) == 0:
            continue
        if tokens[0] != CLS:
            tokens = [CLS] + tokens
        if tokens[-1] != SEP:
            tokens.append(SEP)
        token_idx, segment_idx, mask = to_bert_input(tokens, bert_tokenizer)
        with torch.no_grad():
            outputs = bert_model(token_idx, segment_idx, mask, masked_lm_labels=None)
        logits = outputs[0].squeeze(0)  # (seq_len, vocab_size); torch squeeze, not np.squeeze, on a tensor
        probs = torch.softmax(logits, dim=-1)

        mask_cnt = 0
        for idx, token in enumerate(tokens):
            if token == MASK:
                mask_cnt += 1
                print('Top {} predictions for {} #{}:'.format(args.topk, MASK, mask_cnt))
                topk_prob, topk_indices = torch.topk(probs[idx, :], args.topk)
                topk_tokens = bert_tokenizer.convert_ids_to_tokens(topk_indices.cpu().numpy())
                for prob, tok in zip(topk_prob, topk_tokens):
                    print('{} {:.4f}'.format(tok, prob.item()))
                print('='*80)
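
The loop above depends on a to_bert_input helper defined elsewhere in the original script. A plausible minimal reconstruction, assuming a single segment and a batch of one (the real helper may differ):

def to_bert_input(tokens, bert_tokenizer):
    # Hypothetical reconstruction: one segment, batch of one, attend everywhere.
    token_idx = torch.tensor([bert_tokenizer.convert_tokens_to_ids(tokens)])
    segment_idx = torch.zeros_like(token_idx)   # single-segment input
    mask = torch.ones_like(token_idx)           # full attention mask
    return token_idx, segment_idx, mask
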
Example no. 4
class BertGeneration(object):
    # __init__, tokenize_batch, untokenize_batch, detokenize and printer are
    # identical to Example no. 2 above; the addition in this example is
    # predict_masked below.

    def predict_masked(self, sent):
        tokens = ['[CLS]'] + sent + ['[SEP]']
        target_indices = [i for i, x in enumerate(tokens) if x == '[MASK]']
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        tens = torch.LongTensor(input_ids).unsqueeze(0)
        if self.cuda:
            tens = tens.cuda()
        try:
            res = self.model(tens)[0]
        except RuntimeError:  # error in the model vocabulary; remove once a correct model is trained
            return None
        target_tensor = torch.LongTensor(target_indices)
        if self.cuda:
            target_tensor = target_tensor.cuda()
        # keep only the masked positions, then the 5 highest-scoring ids for each
        res = torch.index_select(res, 1, target_tensor)
        res = torch.narrow(torch.argsort(res, dim=-1, descending=True), -1, 0, 5)

        predicted = []
        for mask in res[0]:
            candidates = self.tokenizer.convert_ids_to_tokens(
                [i.item() for i in mask])

            predicted.append(candidates)

        return predicted
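
A short usage sketch for predict_masked; the paths are hypothetical, and the input must already be WordPiece-tokenized and contain [MASK] placeholders:

generator = BertGeneration(model_directory="./bert-base-uncased",
                           vocab_file="./bert-base-uncased/vocab.txt")
predictions = generator.predict_masked("the capital of france is [MASK] .".split())
if predictions is not None:
    for candidates in predictions:   # one top-5 candidate list per [MASK]
        print(candidates)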