def bertForMaskedLM(*args, **kwargs):
    """
    BertForMaskedLM includes the BertModel Transformer followed by the
    (possibly) pre-trained masked language modeling head.

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> masked_index = 8
        >>> tokenized_text[masked_index] = '[MASK]'
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForMaskedLM
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
        >>> model.eval()
        # Predict all tokens
        >>> with torch.no_grad():
        ...     predictions = model(tokens_tensor, segments_tensors)
        # The model returns a tuple; the first element holds the prediction scores
        >>> predicted_index = torch.argmax(predictions[0][0, masked_index]).item()
        >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
        >>> predicted_token
        'henson'
    """
    model = BertForMaskedLM.from_pretrained(*args, **kwargs)
    return model
Example #2
def load_bertmodel(modelname):
    if modelname:
        tokenizer = BertJapaneseTokenizer.from_pretrained(modelname)
        model = BertForMaskedLM.from_pretrained(modelname)
    else:
        tokenizer, model = None, None
    return tokenizer, model
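A minimal usage sketch for load_bertmodel above; the checkpoint name and the masked sentence are illustrative assumptions, not part of the original example:

import torch

# Hypothetical checkpoint name; any Japanese BERT model on the Hub would do.
tokenizer, model = load_bertmodel('cl-tohoku/bert-base-japanese-whole-word-masking')
if model is not None:
    model.eval()
    input_ids = tokenizer.encode('今日は[MASK]です。', return_tensors='pt')
    mask_pos = torch.where(input_ids == tokenizer.mask_token_id)[1].item()
    with torch.no_grad():
        logits = model(input_ids)[0]               # (1, seq_len, vocab_size)
    top_ids = logits[0, mask_pos].topk(5).indices.tolist()
    print(tokenizer.convert_ids_to_tokens(top_ids))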
Example #3
    def __init__(self, args: argparse.Namespace):
        super().__init__()
        self.args = args
        self.bert_config = BertConfig.from_pretrained(self.args.bert_path)
        self.model = BertForMaskedLM(self.bert_config)
        self.loss_fn = CrossEntropyLoss(reduction="none")

        self.train_acc = MaskedAccuracy()
        self.valid_acc = MaskedAccuracy()
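The reduction="none" above suggests per-token losses are masked by hand; a hedged sketch of a matching loss step follows (the method name and batch layout are assumptions, not part of the original example):

    # Hypothetical loss step: labels are -100 everywhere except masked positions.
    def compute_mlm_loss(self, input_ids, attention_mask, labels):
        logits = self.model(input_ids=input_ids, attention_mask=attention_mask)[0]
        per_token = self.loss_fn(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))
        mask = (labels.reshape(-1) != -100).float()   # score only the masked tokens
        return (per_token * mask).sum() / mask.sum().clamp(min=1)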
Example #4
    def __init__(self, device, keyword_model_file, model_file=None, n_kws=15):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.vocab_size = self.tokenizer.vocab_size
        self.n_kws = n_kws

        self.mask_id = 103
        self.sep_id = 102

        self.kw_ex = KeywordExtractor(n_kws=self.n_kws)
        self.kw_ex.reload(keyword_model_file)
        self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
        self.device = device
        self.model.to(self.device)
        if model_file is not None:
            self.reload_model(model_file)
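The hard-coded ids above match bert-base-uncased, where [MASK] is 103 and [SEP] is 102; an equivalent, tokenizer-derived form avoids baking vocabulary indices into the constructor:

        # Equivalent to the hard-coded values above for bert-base-uncased.
        self.mask_id = self.tokenizer.mask_token_id   # 103
        self.sep_id = self.tokenizer.sep_token_id     # 102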
Example #5
    def __init__(self, device, model_file=None, n_kws=15):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.vocab_size = self.tokenizer.vocab_size
        self.n_kws = n_kws

        self.mask_id = 103
        self.sep_id = 102

        self.kw_ex = KeywordExtractor(n_kws=self.n_kws)
        #    self.kw_ex.reload(keyword_model_file)
        self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
        self.device = device
        self.model.to(self.device)
        if model_file is not None:
            print("Model:", "[{}]".format(model_file.split('/')[-1]),
                  "is loaded.")
            self.reload_model(model_file)
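reload_model is called in both constructors above but not shown; a plausible sketch, assuming model_file holds a plain state_dict saved with torch.save:

    # Hypothetical reload_model; assumes model_file is a state_dict checkpoint.
    def reload_model(self, model_file):
        state_dict = torch.load(model_file, map_location=self.device)
        self.model.load_state_dict(state_dict)
        self.model.eval()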
Example #6
import torch
from transformers.tokenization_bert_japanese import BertJapaneseTokenizer
from transformers.modeling_bert import BertForMaskedLM, BertConfig
import MeCab

# Load the models
tokenizer = BertJapaneseTokenizer.from_pretrained('model/')
config = BertConfig.from_json_file('model/bert_base_32k_config.json')
model = BertForMaskedLM.from_pretrained('model/model.ckpt-580000_pytorch.bin',
                                        config=config)
m = MeCab.Tagger("-Ochasen")


def sent_emb(text):
    print('text:', text)
    input_ids = tokenizer.encode(text, return_tensors='pt')
    print('tokenizer.convert:',
          tokenizer.convert_ids_to_tokens(input_ids[0].tolist()))

    masked_index = torch.where(
        input_ids == tokenizer.mask_token_id)[1].tolist()[0]
    print('masked index:', masked_index)
    result = model(input_ids)
    pred_ids = result[0][:, masked_index].topk(10).indices.tolist()[0]

    output = []
    for pred_id in pred_ids:
        output_ids = input_ids.tolist()[0]
        output_ids[masked_index] = pred_id
        text = ''.join(tokenizer.decode(output_ids))
        # print(text)
        output.append(text)

    return output
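A quick usage sketch for sent_emb above; the sentence is an assumption, and any input containing [MASK] works with this tokenizer:

# Hypothetical call; prints the tokenization and the masked index, then
# yields the top-10 filled-in sentences.
for candidate in sent_emb('今日は[MASK]へ行きました。'):
    print(candidate)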
Example #7
    def __init__(self,
                 vocab: Vocabulary,
                 pretrained_model: str = None,
                 requires_grad: bool = True,
                 predictions_file=None,
                 layer_freeze_regexes: List[str] = None,
                 probe_type: str = None,
                 loss_on_all_vocab: bool = False,
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self._loss_on_all_vocab = loss_on_all_vocab

        self._predictions_file = predictions_file

        # TODO move to predict
        if predictions_file is not None and os.path.isfile(predictions_file):
            os.remove(predictions_file)

        self._pretrained_model = pretrained_model
        if 'roberta' in pretrained_model:
            self._padding_value = 1  # The index of the RoBERTa padding token
            if loss_on_all_vocab:
                self._transformer_model = RobertaForMaskedLM.from_pretrained(
                    pretrained_model)
            else:
                self._transformer_model = RobertaForMultiChoiceMaskedLM.from_pretrained(
                    pretrained_model)
        elif 'xlnet' in pretrained_model:
            self._padding_value = 5  # The index of the XLNet padding token
            self._transformer_model = XLNetLMHeadModel.from_pretrained(
                pretrained_model)
        elif 'albert' in pretrained_model:
            if loss_on_all_vocab:
                self._transformer_model = AlbertForMaskedLM.from_pretrained(
                    pretrained_model)
            else:
                self._transformer_model = BertForMultiChoiceMaskedLM.from_pretrained(
                    pretrained_model)
            self._padding_value = 0  # The index of the BERT padding token
        elif 'bert' in pretrained_model:
            if loss_on_all_vocab:
                self._transformer_model = BertForMaskedLM.from_pretrained(
                    pretrained_model)
            else:
                self._transformer_model = BertForMultiChoiceMaskedLM.from_pretrained(
                    pretrained_model)
            self._padding_value = 0  # The index of the BERT padding token
        else:
            raise ValueError('Unsupported pretrained model: {}'.format(pretrained_model))

        if probe_type == 'MLP':
            layer_freeze_regexes = ["embeddings", "encoder", "pooler"]
        elif probe_type == 'linear':
            layer_freeze_regexes = [
                "embeddings", "encoder", "pooler", "dense", "LayerNorm",
                "layer_norm"
            ]

        for name, param in self._transformer_model.named_parameters():
            if layer_freeze_regexes and requires_grad:
                grad = not any(
                    [bool(re.search(r, name)) for r in layer_freeze_regexes])
            else:
                grad = requires_grad
            param.requires_grad = grad

        # make sure decoder gradients are on.
        if 'roberta' in pretrained_model:
            self._transformer_model.lm_head.decoder.weight.requires_grad = True
            self._transformer_model.lm_head.bias.requires_grad = True
        elif 'albert' in pretrained_model:
            pass
        elif 'bert' in pretrained_model:
            self._transformer_model.cls.predictions.decoder.weight.requires_grad = True
            self._transformer_model.cls.predictions.bias.requires_grad = True

        transformer_config = self._transformer_model.config
        transformer_config.num_labels = 1
        self._output_dim = self._transformer_model.config.hidden_size

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.CrossEntropyLoss()
        self._debug = 2
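A self-contained sketch of the regex-based freezing used above, showing what probe_type='linear' leaves trainable; the bert-base-uncased checkpoint and the final printout are illustrative assumptions:

import re
from transformers import BertForMaskedLM

model = BertForMaskedLM.from_pretrained('bert-base-uncased')
linear_probe_regexes = ["embeddings", "encoder", "pooler", "dense", "LayerNorm", "layer_norm"]
for name, param in model.named_parameters():
    # freeze everything whose name matches one of the regexes
    param.requires_grad = not any(re.search(r, name) for r in linear_probe_regexes)

# Typically only the MLM head's decoder/bias parameters survive the freeze.
print([n for n, p in model.named_parameters() if p.requires_grad])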
Example #8
def run_pplm_example_bert(pretrained_model="bert-base-cased",
                          mask_prob=0.5,
                          do_selective_mask=True,
                          cond_text="",
                          num_samples=1,
                          bag_of_words=None,
                          discrim=None,
                          discrim_weights=None,
                          discrim_meta=None,
                          class_label=-1,
                          length=100,
                          stepsize=0.02,
                          temperature=1.0,
                          top_k=10,
                          sample=True,
                          num_iterations=3,
                          grad_length=10000,
                          horizon_length=1,
                          decay=False,
                          gamma=1.5,
                          gm_scale=0.9,
                          kl_scale=0.01,
                          seed=0,
                          no_cuda=False,
                          colorama=False,
                          strategy='pick_best',
                          verbosity='regular',
                          return_sent=False):
    # This is the main function for BERT.

    # set Random seed
    torch.manual_seed(seed)
    np.random.seed(seed)

    # set verbosity
    verbosity_level = VERBOSITY_LEVELS.get(verbosity.lower(), REGULAR)

    # set the device
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    # set discriminator (TODO: figure out where this is used)
    # Modifies the global variables
    if discrim == 'generic':
        set_generic_model_params(discrim_weights, discrim_meta)
    if discrim is not None:
        discriminator_pretrained_model = DISCRIMINATOR_MODELS_PARAMS[discrim][
            "pretrained_model"]
        if pretrained_model != discriminator_pretrained_model:
            pretrained_model = discriminator_pretrained_model
            if verbosity_level >= REGULAR:
                print("discrim = {}, pretrained_model set "
                      "to discriminator's = {}".format(discrim,
                                                       pretrained_model))

    # load pretrained model
    model = BertForMaskedLM.from_pretrained(pretrained_model,
                                            output_hidden_states=True)
    model.to(device)
    model.eval()

    # load tokenizer
    tokenizer = BertTokenizer.from_pretrained(pretrained_model)

    # Freeze BERT weights
    for param in model.parameters():
        param.requires_grad = False

    # figure out conditioning text
    # if uncond, use start of sentence as the prompt
    # we need to change this into a whole sentence

    raw_text = cond_text

    while not raw_text:
        print("Did you forget to add `--cond_text`? ")
        raw_text = input("Model prompt >>> ")
    # Different: we also add the EOS token now (as opposed to only the BOS token)
    tokenized_cond_text = tokenizer.encode(raw_text)

    print("= Original sentence =")
    print(tokenizer.decode(tokenized_cond_text))
    print()
    #randomly mask out a certain percentage of tokens or do_selective
    sent_len = len(tokenized_cond_text) - 2
    # masked_indices = np.random.choice( range(1, len(tokenized_cond_text)-1), int(sent_len * mask_prob))

    # Select the indices to mask. Note that masked_indices, init_score and
    # masked_score are only defined when a discriminator is given and
    # do_selective_mask is enabled.
    if discrim is not None and do_selective_mask:
        classifier, class_id = get_classifier(discrim, class_label, device)
        masked_indices, init_score, masked_score = selective_mask(
            raw_text, mask_prob, model, tokenizer, classifier, class_id,
            device, strategy)
    orig_scores = [init_score, masked_score]

    # masked_indices = np.array([5,6,7])

    # get the mask labels
    # note: the ignore_index for masked LM labels is now -100
    masked_lm_labels = [[-100 for _ in range(len(tokenized_cond_text))]]
    for ind in masked_indices:
        masked_lm_labels[0][ind] = tokenized_cond_text[ind]
    masked_lm_labels = torch.tensor(masked_lm_labels,
                                    device=device,
                                    dtype=torch.long)
    for ind in masked_indices:
        tokenized_cond_text[ind] = tokenizer.convert_tokens_to_ids(
            tokenizer.mask_token)
    #PRINT the masked version of the input_text
    print("After masking")
    masked_text = tokenizer.decode(tokenized_cond_text)
    print(masked_text)

    # generate unperturbed and perturbed texts

    # full_text_generation returns:
    # unpert_gen_tok_text, pert_gen_tok_texts, discrim_losses, losses_in_time
    # BERT-completed sentence without perturbing
    unpert_gen_tok_text, pert_gen_tok_texts, _, _ = full_text_generation_bert(
        model=model,
        tokenizer=tokenizer,
        context=tokenized_cond_text,
        masked_indices=masked_indices,
        masked_lm_labels=masked_lm_labels,
        device=device,
        num_samples=num_samples,
        bag_of_words=bag_of_words,
        discrim=discrim,
        class_label=class_label,
        stepsize=stepsize,
        temperature=temperature,
        top_k=top_k,
        sample=sample,
        num_iterations=num_iterations,
        grad_length=grad_length,
        horizon_length=horizon_length,
        decay=decay,
        gamma=gamma,
        gm_scale=gm_scale,
        kl_scale=kl_scale,
        verbosity_level=verbosity_level)

    # untokenize unperturbed text
    print('UNPERT\n')
    unpert_gen_text = tokenizer.decode(unpert_gen_tok_text)

    if verbosity_level >= REGULAR:
        print("=" * 80)
    print("= Unperturbed generated text =")
    print(unpert_gen_text)
    print()

    generated_texts = []

    bow_word_ids = set()
    if bag_of_words and colorama:
        bow_indices = get_bag_of_words_indices_bert(bag_of_words.split(";"),
                                                    tokenizer)
        for single_bow_list in bow_indices:
            # filter out all words in the list composed of more than 1 token
            filtered = list(filter(lambda x: len(x) <= 1, single_bow_list))
            # w[0] because the previous filter guarantees w has exactly 1 item
            bow_word_ids.update(w[0] for w in filtered)
    # iterate through the perturbed texts
    for i, pert_gen_tok_text in enumerate(pert_gen_tok_texts):
        try:
            # untokenize unperturbed text
            if colorama:
                import colorama

                pert_gen_text = ''
                for word_id in pert_gen_tok_text.tolist()[0]:
                    if word_id in bow_word_ids:
                        pert_gen_text += '{}{}{}'.format(
                            colorama.Fore.RED, tokenizer.decode([word_id]),
                            colorama.Style.RESET_ALL)
                    else:
                        pert_gen_text += tokenizer.decode([word_id])
            else:
                pert_gen_text = tokenizer.decode(pert_gen_tok_text)

            print("= Perturbed generated text {} =".format(i + 1))
            print(pert_gen_text)
            print()
        except Exception:
            continue
        # keep the prefix, perturbed seq, original seq for each index
        # return should contain: masked sentence, pert_gen_text, unpert_gen_text
        # scores = [initial_score, score_after_masking, score_after_filling_in]
        new_score = get_score(pert_gen_tok_text, model, classifier, device)
        generated_texts.append((pert_gen_text, unpert_gen_text, new_score))
    if return_sent:
        return [masked_text, orig_scores, generated_texts]
    return
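The label construction above relies on the Hugging Face convention that label positions set to -100 are ignored by the masked-LM loss; a stripped-down sketch of just that masking step (the sentence and the masked position are assumptions):

import torch
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained('bert-base-cased')
ids = tok.encode('Jim Henson was a puppeteer')     # [CLS] ... [SEP]
masked_positions = [3]                             # assumption: mask one token
labels = [-100] * len(ids)                         # -100 = ignored by the loss
for i in masked_positions:
    labels[i] = ids[i]                             # keep the true id only where masked
    ids[i] = tok.mask_token_id                     # feed [MASK] to the model instead
input_ids = torch.tensor([ids])
masked_lm_labels = torch.tensor([labels])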
Example #9
def train_cbert_and_augment(args):
    task_name = args.task_name
    os.makedirs(args.output_dir, exist_ok=True)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.deterministic = True

    processor = get_task_processor(task_name, args.data_dir)
    label_list = processor.get_labels(task_name)

    # load train and dev data
    train_examples = processor.get_train_examples()
    dev_examples = processor.get_dev_examples()

    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL,
                                              do_lower_case=True,
                                              cache_dir=args.cache)

    model = BertForMaskedLM.from_pretrained(BERT_MODEL, cache_dir=args.cache)

    if len(label_list) > 2:
        model.bert.embeddings.token_type_embeddings = torch.nn.Embedding(
            len(label_list), 768)
        model.bert.embeddings.token_type_embeddings.weight.data.normal_(
            mean=0.0, std=0.02)

    model.to(device)

    # train data
    train_features = convert_examples_to_features(train_examples, label_list,
                                                  args.max_seq_length,
                                                  tokenizer, args.seed)
    train_data = prepare_data(train_features)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    #dev data
    dev_features = convert_examples_to_features(dev_examples, label_list,
                                                args.max_seq_length, tokenizer,
                                                args.seed)
    dev_data = prepare_data(dev_features)
    dev_sampler = SequentialSampler(dev_data)
    dev_dataloader = DataLoader(dev_data,
                                sampler=dev_sampler,
                                batch_size=args.train_batch_size)

    num_train_steps = int(
        len(train_features) / args.train_batch_size * args.num_train_epochs)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_features))
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)

    # Prepare optimizer
    t_total = num_train_steps
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=1e-8)

    best_dev_loss = float('inf')
    for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
        avg_loss = 0.
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch[1],
                'attention_mask': batch[2],
                'token_type_ids': batch[3],
                'masked_lm_labels': batch[4]
            }

            outputs = model(**inputs)
            loss = outputs[0]
            optimizer.zero_grad()
            loss.backward()
            avg_loss += loss.item()
            optimizer.step()

            if (step + 1) % 50 == 0:
                print("avg_loss: {}".format(avg_loss / 50))
                avg_loss = 0.

        # eval on dev after every epoch
        dev_loss = compute_dev_loss(model, dev_dataloader)
        print("Epoch {}, Dev loss {}".format(epoch, dev_loss))
        if dev_loss < best_dev_loss:
            best_dev_loss = dev_loss
            print("Saving model. Best dev so far {}".format(best_dev_loss))
            save_model_path = os.path.join(args.output_dir, 'best_cbert.pt')
            torch.save(model.state_dict(), save_model_path)

    # augment data using the best model
    augment_train_data(model, tokenizer, train_data, label_list, args)
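compute_dev_loss is called in the epoch loop above but not defined in this example; a plausible sketch, assuming the same batch layout and the older masked_lm_labels keyword used in the training step:

# Hypothetical compute_dev_loss; mirrors the training loop's batch layout.
def compute_dev_loss(model, dev_dataloader):
    model.eval()
    total_loss, n_batches = 0.0, 0
    with torch.no_grad():
        for batch in dev_dataloader:
            batch = tuple(t.to(device) for t in batch)
            outputs = model(input_ids=batch[1],
                            attention_mask=batch[2],
                            token_type_ids=batch[3],
                            masked_lm_labels=batch[4])
            total_loss += outputs[0].item()
            n_batches += 1
    return total_loss / max(n_batches, 1)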
Example #10
    def __init__(self,
                 model,
                 train_dataset,
                 trainer_config,
                 writer,
                 logger=None,
                 test_dataset=None,
                 valid_dataset=None,
                 n_jobs=0,
                 label_smoothing=0,
                 device=torch.device('cuda'),
                 evaluate_full_sequences=False,
                 ignore_idxs=[],
                 full_input=False,
                 max_length=511,
                 max_y_length=80,
                 new_dataset=False,
                 best_model_path='',
                 no_persona=False,
                 mixup=False,
                 mixup_mode='alternate',
                 mixup_dataset=None,
                 mixup_ratio=0.15,
                 bert_mixup=False,
                 replace=False,
                 pointer_gen=False):
        if logger is None:
            self.logger = logging.getLogger(__file__)
        else:
            self.logger = logger

        self.train_batch_size = trainer_config.train_batch_size
        self.test_batch_size = trainer_config.test_batch_size
        self.lr = trainer_config.lr
        self.lr_warmup = trainer_config.lr_warmup
        self.weight_decay = trainer_config.weight_decay
        self.batch_split = trainer_config.batch_split
        self.s2s_weight = 1
        self.single_input = True
        self.clip_grad = trainer_config.clip_grad
        self.n_epochs = trainer_config.n_epochs
        self.linear_schedule = trainer_config.linear_schedule
        self.patience = trainer_config.patience
        self.model_saving_interval = trainer_config.model_saving_interval
        self.device = device
        self.no_persona = no_persona
        self.evaluate_full_sequences = evaluate_full_sequences
        self.global_step = 0
        self.full_input = full_input
        self.max_length = max_length
        self.max_y_length = max_y_length
        self.new_dataset = new_dataset
        self.best_ppl = 1e5
        self.best_model_path = best_model_path
        self.mixup_mode = mixup_mode
        self.replace = replace
        self.mixup = mixup
        self.mixup_dataset = mixup_dataset
        self.mixup_ratio = mixup_ratio
        self.model_type = 'pretrain'
        self.patience_cnt = 0
        self.stop_training = False
        self.pointer_gen = pointer_gen

        self.model = model.to(device)

        self.criterion = LabelSmoothingLoss(
            n_labels=self.model.n_embeddings,
            smoothing=label_smoothing,
            ignore_index=self.model.padding_idx).to(device)

        param_optimizer = list(self.model.named_parameters())
        # Here we should remove parameters which are not used during training to avoid breaking apex with None grads
        self.loss_weight = None
        no_decay = ['bias', 'loss']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            self.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        base_optimizer = Adam(optimizer_grouped_parameters, lr=self.lr)

        if not self.linear_schedule:
            self.optimizer = NoamOpt(self.model.embeddings_size,
                                     self.lr_warmup,
                                     base_optimizer,
                                     lr=self.lr,
                                     linear_schedule=False,
                                     loss_weight=self.loss_weight)
        else:
            total_steps = len(
                train_dataset) * self.n_epochs // self.train_batch_size
            self.optimizer = NoamOpt(self.model.embeddings_size,
                                     self.lr_warmup,
                                     base_optimizer,
                                     linear_schedule=True,
                                     lr=self.lr,
                                     total_steps=total_steps,
                                     loss_weight=self.loss_weight)

        train_sampler = RandomSampler(train_dataset)
        self.train_dataloader = DataLoader(train_dataset,
                                           batch_size=self.train_batch_size //
                                           self.batch_split,
                                           sampler=train_sampler,
                                           num_workers=n_jobs,
                                           collate_fn=self.collate_func)
        self.train_dataset = train_dataset  # used to sample negative examples
        if test_dataset is not None:  # only do evaluation on main process
            self.test_dataloader = DataLoader(test_dataset,
                                              batch_size=self.test_batch_size,
                                              shuffle=False,
                                              num_workers=n_jobs,
                                              collate_fn=self.collate_func)
        if valid_dataset is not None:
            self.valid_dataloader = DataLoader(valid_dataset,
                                               batch_size=self.test_batch_size,
                                               shuffle=False,
                                               num_workers=n_jobs,
                                               collate_fn=self.collate_func)
        self.bert_mixup = bert_mixup
        if bert_mixup:
            self.bert_model = BertForMaskedLM.from_pretrained(
                './bert_model').to(device)
            self.bert_tokenizer = BertTokenizer.from_pretrained('./bert_model')

        self.vocab = train_dataset.vocab
        self.writer = writer

        if isinstance(self.model, TransformerSeq2Seq):
            self.model_type = 'seq2seq'
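NoamOpt above is project-specific and not shown here; the underlying "Noam" schedule from Attention Is All You Need scales the learning rate as d_model**-0.5 * min(step**-0.5, step * warmup**-1.5), roughly as in this sketch (the wrapper's extra lr and linear_schedule options are not modeled):

# Minimal sketch of the Noam learning-rate schedule that NoamOpt presumably wraps.
def noam_lr(step, d_model, warmup):
    step = max(step, 1)
    return d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)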
Example #11
import torch
from flask import Flask, request, render_template
from transformers.modeling_bert import BertForMaskedLM
from transformers.tokenization_bert_japanese import BertJapaneseTokenizer

app = Flask(__name__)
model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)
model.eval()


def get_prediction(s):
    assert '[MASK]' in s
    input_ids = tokenizer.encode(s, return_tensors="pt")
    masked_index = torch.where(
        input_ids == tokenizer.mask_token_id)[1].tolist()[0]
    with torch.no_grad():
        # (1, seq_len, vocab_size)
        logits, = model(input_ids)
        # probabilities over the vocabulary at the masked position: (vocab_size,)
        probs_for_mask = torch.softmax(logits[0, masked_index], dim=-1)
    topk_probs, topk_indices = torch.topk(probs_for_mask, k=10)
    return [tokenizer.decode([i])
            for i in topk_indices.tolist()], topk_probs.tolist()
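A direct usage sketch for get_prediction outside the Flask route (the sentence is an assumption):

# Hypothetical direct call, bypassing the web form.
tokens, probs = get_prediction('今日は[MASK]です。')
for t, p in zip(tokens, probs):
    print('{}\t{:.3f}'.format(t, p))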


@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        s = request.form['sent']