Example #1
from sklearn.model_selection import train_test_split
from transformers import (BertTokenizerFast, EncoderDecoderModel,
                          Seq2SeqTrainer, Seq2SeqTrainingArguments)

# read_split, ReviewDataset and build_compute_metrics_fn are project-local
# helpers defined elsewhere in the original source.


def main(args):
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
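    # BERT has no dedicated BOS/EOS tokens, so [CLS]/[SEP] are reused for them.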
    tokenizer.bos_token = tokenizer.cls_token
    tokenizer.eos_token = tokenizer.sep_token
    train_texts, train_labels = read_split('train', args.data_path,
                                           args.context)
    test_texts, test_labels = read_split('test', args.data_path, args.context)

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_texts, train_labels, test_size=.2)

    encoder_max_length = 128
    decoder_max_length = 128

    train_encodings = tokenizer(train_texts,
                                truncation=True,
                                padding=True,
                                max_length=encoder_max_length)
    val_encodings = tokenizer(val_texts,
                              truncation=True,
                              padding=True,
                              max_length=encoder_max_length)
    test_encodings = tokenizer(test_texts,
                               truncation=True,
                               padding=True,
                               max_length=encoder_max_length)

    train_decodings = tokenizer(train_labels,
                                truncation=True,
                                padding=True,
                                max_length=decoder_max_length)
    val_decodings = tokenizer(val_labels,
                              truncation=True,
                              padding=True,
                              max_length=decoder_max_length)
    test_decodings = tokenizer(test_labels,
                               truncation=True,
                               padding=True,
                               max_length=decoder_max_length)

    train_data = ReviewDataset(train_texts, train_labels, train_encodings,
                               train_decodings, tokenizer.pad_token_id)
    val_data = ReviewDataset(val_texts, val_labels, val_encodings,
                             val_decodings, tokenizer.pad_token_id)
    test_data = ReviewDataset(test_texts, test_labels, test_encodings,
                              test_decodings, tokenizer.pad_token_id)

    bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-uncased", "bert-base-uncased")
    # set special tokens
    bert2bert.config.decoder_start_token_id = tokenizer.bos_token_id
    bert2bert.config.eos_token_id = tokenizer.eos_token_id
    bert2bert.config.pad_token_id = tokenizer.pad_token_id

    # sensible parameters for beam search
    bert2bert.config.vocab_size = bert2bert.config.decoder.vocab_size
    bert2bert.config.max_length = 142
    bert2bert.config.min_length = 56
    bert2bert.config.no_repeat_ngram_size = 3
    bert2bert.config.early_stopping = True
    bert2bert.config.length_penalty = 2.0
    bert2bert.config.num_beams = 4
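    # These generation settings are stored on the model config, where
    # generate() picks them up as defaults when the trainer evaluates with
    # predict_with_generate=True.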

    # set training arguments - these params are not really tuned, feel free to change
    batch_size = 10  # change to 16 for full training
    training_args = Seq2SeqTrainingArguments(
        output_dir="./cptk_yelp",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        predict_with_generate=True,
        evaluate_during_training=True,  # newer transformers use evaluation_strategy="steps" instead
        do_train=True,
        do_eval=True,
        logging_steps=1000,  # set to 1000 for full training
        save_steps=800,  # set to 500 for full training
        eval_steps=800,  # set to 8000 for full training
        warmup_steps=2000,  # set to 2000 for full training
        overwrite_output_dir=True,
        save_total_limit=10,
        fp16=True,
        num_train_epochs=1000,
    )

    # instantiate trainer
    trainer = Seq2SeqTrainer(
        model=bert2bert,
        args=training_args,
        compute_metrics=build_compute_metrics_fn,
        train_dataset=train_data,
        eval_dataset=val_data,
    )
    trainer.train()
Example #2
    add_cross_attention=True,  # add cross attention layers
    vocab_size=len(decoder_tokenizer),
    # Set required tokens.
    unk_token_id=decoder_tokenizer.vocab["[UNK]"],
    sep_token_id=decoder_tokenizer.vocab["[SEP]"],
    pad_token_id=decoder_tokenizer.vocab["[PAD]"],
    cls_token_id=decoder_tokenizer.vocab["[CLS]"],
    mask_token_id=decoder_tokenizer.vocab["[MASK]"],
    #bos_token_id = decoder_tokenizer.vocab["[BOS]"],
    #eos_token_id = decoder_tokenizer.vocab["[EOS]"],
)
# Initialize a brand new bert-based decoder.
decoder = BertGenerationDecoder(config=decoder_config)

# Setup enc-decoder mode.
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
bert2bert.config.decoder_start_token_id = decoder_tokenizer.vocab["[CLS]"]
bert2bert.config.pad_token_id = decoder_tokenizer.vocab["[PAD]"]
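# generate() and the loss computation read these ids from the top-level
# config, so they must be set before training.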

# Elementary Training.
optimizer = torch.optim.Adam(bert2bert.parameters(), lr=1e-6)
bert2bert.cuda()

for epoch in range(30):
    print("*" * 50, "Epoch", epoch, "*" * 50)
    for batch in tqdm(sierra_dl):
        # tokenize commands and goals.
        inputs = encoder_tokenizer(batch["command"],
                                   add_special_tokens=True,
                                   return_tensors="pt",
                                   padding=True,
Example #3
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )

    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_predict",
        action="store_true",
        help="Whether to run predictions on the test set.",
    )
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.",
    )
    parser.add_argument(
        "--keep_accents",
        action="store_const",
        const=True,
        help="Set this flag if model is trained with accents.",
    )
    parser.add_argument(
        "--strip_accents",
        action="store_const",
        const=True,
        help="Set this flag if model is trained without accents.",
    )
    parser.add_argument(
        "--use_fast",
        action="store_const",
        const=True,
        help="Set this flag to use fast tokenization.",
    )
    parser.add_argument(
        "--train_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument(
        "--optimizer",
        default="lamb",
        type=str,
        help="Optimizer (AdamW or lamb)",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--learning_rate",
        default=5e-5,
        type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs",
        default=3.0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Overrides num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=500,
                        help="Log every X updates steps.")
    parser.add_argument(
        "--save_steps",
        type=int,
        default=500,
        help="Save checkpoint every X updates steps.",
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir",
        action="store_true",
        help="Overwrite the content of the output directory",
    )
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets",
    )
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="For distributed training: local_rank",
    )
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()

    # New example based on https://colab.research.google.com/drive/1uVP09ynQ1QUmSE2sjEysHjMfKgo4ssb7?usp=sharing
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('DEVICE: ' + str(device))

    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        'bert-base-cased', 'bert-base-cased')
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    # note: the key must be 'weight_decay' (not 'weight_decay_rate') for AdamW
    # to actually apply decay to the first group.
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01,
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

    model.train()
    train_loss_set = []
    train_loss = 0
    save_step = 500

    for epoch in range(int(args.num_train_epochs)):
        batches = tqdm(batch_loader(tokenizer,
                                    args.data_dir,
                                    step='train',
                                    batch_size=args.train_batch_size,
                                    start_pad=False),
                       desc='Training')
        for step, batch in enumerate(batches):
            batch = tuple(t.to(device) for t in batch)
            input_ids_encode, attention_mask_encode, input_ids_decode, attention_mask_decode, lm_labels = batch
            optimizer.zero_grad()
            model.zero_grad()

            loss, outputs = model(input_ids=input_ids_encode,
                                  decoder_input_ids=input_ids_decode,
                                  attention_mask=attention_mask_encode,
                                  decoder_attention_mask=attention_mask_decode,
                                  lm_labels=lm_labels)[:2]
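            # NOTE: lm_labels is the pre-4.0 transformers argument; recent
            # versions expect labels= instead and expose the loss as outputs.loss.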

            train_loss_set.append(loss.item())
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        print(epoch)
        clear_output(True)
        plt.plot(train_loss_set)
        plt.title(f'Training loss. Epoch {epoch}')
        plt.xlabel(f'Batch {step}')
        plt.ylabel('Loss')
        plt.show()

    print('STARTING EVALUATION')
    model.eval()

    test_batches = tqdm(batch_loader(tokenizer,
                                     args.data_dir,
                                     step='test',
                                     batch_size=1,
                                     start_pad=True),
                        desc='Evaluating')
    for step, batch in enumerate(test_batches):
        batch = tuple(t.to(device) for t in batch)
        input_ids_encode, attention_mask_encode, input_ids_decode, attention_mask_decode, lm_labels = batch
        with torch.no_grad():
            generated = model.generate(
                input_ids_encode,
                attention_mask=attention_mask_encode,
                decoder_start_token_id=model.config.decoder.pad_token_id,
                do_sample=True,
                max_length=10,
                top_k=200,
                top_p=0.75,
                num_return_sequences=10,
                #num_beams=5,
                #no_repeat_ngram_size=2,
            )
            for i in range(len(generated)):
                print(
                    f'Generated {i}: {tokenizer.decode(generated[i], skip_special_tokens=True, clean_up_tokenization_spaces=True)}'
                )

            print(
                'Expected: ', ' '.join([
                    tokenizer.decode(elem,
                                     skip_special_tokens=True,
                                     clean_up_tokenization_spaces=True)
                    for elem in input_ids_decode
                ]))
            print(
                'Lm Labels: ', ' '.join([
                    tokenizer.decode(elem,
                                     skip_special_tokens=True,
                                     clean_up_tokenization_spaces=True)
                    for elem in lm_labels
                ]))
            print(
                'Input: ', ' '.join([
                    tokenizer.decode(elem,
                                     skip_special_tokens=True,
                                     clean_up_tokenization_spaces=True)
                    for elem in input_ids_encode
                ]))
            print()
Example #4
    def create_and_check_encoder_decoder_shared_weights(
        self,
        config,
        input_ids,
        attention_mask,
        encoder_hidden_states,
        decoder_config,
        decoder_input_ids,
        decoder_attention_mask,
        labels,
        **kwargs
    ):
        torch.manual_seed(0)
        encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
        model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
        model.to(torch_device)
        model.eval()
        # load_state_dict copies weights but does not tie them
        decoder_state_dict = model.decoder._modules[model.decoder.base_model_prefix].state_dict()
        model.encoder.load_state_dict(decoder_state_dict, strict=False)

        torch.manual_seed(0)
        tied_encoder_model, tied_decoder_model = self.get_encoder_decoder_model(config, decoder_config)
        config = EncoderDecoderConfig.from_encoder_decoder_configs(
            tied_encoder_model.config, tied_decoder_model.config, tie_encoder_decoder=True
        )
        tied_model = EncoderDecoderModel(encoder=tied_encoder_model, decoder=tied_decoder_model, config=config)
        tied_model.to(torch_device)
        tied_model.eval()

        model_result = model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
        )

        tied_model_result = tied_model(
            input_ids=input_ids,
            decoder_input_ids=decoder_input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
        )

        # check that the tied model has fewer parameters than the untied one
        self.assertLess(sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters()))
        random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item()

        # check that outputs are equal
        self.assertTrue(
            torch.allclose(
                model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4
            )
        )

        # check that outputs after saving and loading are equal
        with tempfile.TemporaryDirectory() as tmpdirname:
            tied_model.save_pretrained(tmpdirname)
            tied_model = EncoderDecoderModel.from_pretrained(tmpdirname)
            tied_model.to(torch_device)
            tied_model.eval()

            # check that the tied model has fewer parameters than the untied one
            self.assertLess(
                sum(p.numel() for p in tied_model.parameters()), sum(p.numel() for p in model.parameters())
            )
            random_slice_idx = ids_tensor((1,), model_result[0].shape[-1]).item()

            tied_model_result = tied_model(
                input_ids=input_ids,
                decoder_input_ids=decoder_input_ids,
                attention_mask=attention_mask,
                decoder_attention_mask=decoder_attention_mask,
            )

            # check that outputs are equal
            self.assertTrue(
                torch.allclose(
                    model_result[0][0, :, random_slice_idx], tied_model_result[0][0, :, random_slice_idx], atol=1e-4
                )
            )
Example #5
 def get_pretrained_model(self):
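     # Note: from_encoder_decoder_pretrained leaves the decoder's new
     # cross-attention weights randomly initialized, so the combined model
     # must be fine-tuned before it generates anything sensible.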
     return EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
Example #6
 def test_real_bert_model_from_pretrained(self):
     model = EncoderDecoderModel.from_encoder_decoder_pretrained(
         "bert-base-uncased", "bert-base-uncased")
     self.assertIsNotNone(model)
Example #7
 def __init__(self, model_name, device):
     self.tokenizer = BertTokenizerFast.from_pretrained(model_name)
     self.model = EncoderDecoderModel.from_pretrained(model_name)
     self.model = self.model.to(device)
Example #8
    if args.checkpoint is not None:
        model_created = True
        if args.bart:
            config = BartConfig.from_json_file(args.checkpoint +
                                               "/config.json")
            model = BartForConditionalGeneration.from_pretrained(
                args.checkpoint + "/pytorch_model.bin", config=config)
        elif args.t5:
            config = T5Config.from_json_file(args.checkpoint + "/config.json")
            model = T5ForConditionalGeneration.from_pretrained(
                args.checkpoint + "/pytorch_model.bin", config=config)
        else:
            config = EncoderDecoderConfig.from_json_file(args.checkpoint +
                                                         "/config.json")
            model = EncoderDecoderModel.from_pretrained(args.checkpoint +
                                                        "/pytorch_model.bin",
                                                        config=config)
        model_name = args.checkpoint

    if args.bart:
        if args.checkpoint is None:
            model_name = "WikinewsSum/bart-large-multi-fr-wiki-news" if args.model_name == "" else args.model_name
        tokenizer = BartTokenizer.from_pretrained(
            args.tokenizer if args.tokenizer is not None else model_name)
        if not model_created:
            model = BartForConditionalGeneration.from_pretrained(model_name)
            model_created = True

    if args.t5:
        return batch

    def format_rouge_output(rouge_output):
        return {
            "rouge1_precision": round(rouge_output["rouge1"].mid.precision, 4),
            "rouge1_recall": round(rouge_output["rouge1"].mid.recall, 4),
            "rouge1_fmeasure": round(rouge_output["rouge1"].mid.fmeasure, 4),
            "rouge2_precision": round(rouge_output["rouge2"].mid.precision, 4),
            "rouge2_recall": round(rouge_output["rouge2"].mid.recall, 4),
            "rouge2_fmeasure": round(rouge_output["rouge2"].mid.fmeasure, 4),
            "rougeL_precision": round(rouge_output["rougeL"].mid.precision, 4),
            "rougeL_recall": round(rouge_output["rougeL"].mid.recall, 4),
            "rougeL_fmeasure": round(rouge_output["rougeL"].mid.fmeasure, 4)
        }

    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-uncased", "bert-base-uncased")
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    tokenizer.bos_token = tokenizer.cls_token
    tokenizer.eos_token = tokenizer.sep_token

    wiki_train_dataset = load_dataset('wikihow',
                                      'all',
                                      data_dir='manual_wikihow_data',
                                      split='train')
    wiki_val_dataset = load_dataset('wikihow',
                                    'all',
                                    data_dir='manual_wikihow_data',
                                    split='validation')

    rouge = load_metric('rouge')
Example #10
        return [json.loads(l) for l in istream]


if __name__ == "__main__":
    args = parse_args()

    token_to_index = TokenToIndexConverter(
        "vocab/github_python_minus_ethpy150open_deduplicated_vocabulary.txt")

    set_global_seed(19)

    DATA_FOLDER = Path("data")
    train = read_jsonl(DATA_FOLDER / "train_preprocessed.jsonl")
    test = read_jsonl(DATA_FOLDER / "test_preprocessed.jsonl")

    model = EncoderDecoderModel.from_pretrained(args.model)

    train_dataset = get_method_name_dataset(
        train, token_to_index, token_to_index.pad_index,
        model.encoder.config.max_position_embeddings)
    test_dataset = get_method_name_dataset(
        test, token_to_index, token_to_index.pad_index,
        model.encoder.config.max_position_embeddings)

    DEVICE = torch.device(args.device)
    model.to(DEVICE).eval()

    metrics = []

    with torch.no_grad():
        for i in tqdm(range(len(test_dataset))):
Example #11
import streamlit as st

from ..components.fetch import *
from .translation import *

from transformers import BertTokenizerFast, EncoderDecoderModel
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the mini2bert model fine-tuned on cnn_daily_mail
tokenizer = BertTokenizerFast.from_pretrained('mrm8488/bert-mini2bert-mini-finetuned-cnn_daily_mail-summarization')
model = EncoderDecoderModel.from_pretrained('mrm8488/bert-mini2bert-mini-finetuned-cnn_daily_mail-summarization').to(device)

# Summarization model inference, English-only for now.
def get_answer(text):
    inputs = tokenizer([text], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    output = model.generate(input_ids, attention_mask=attention_mask)
    return tokenizer.decode(output[0], skip_special_tokens=True)

def main():
    front_up()
    st.title('Sistema de Sumarizacion de texto')
    context = st.text_area(label="Ingrese el texto a resumir", height=320)
    if st.button("Cargar modelo"):
        context2 = get_answer_es_en(context)
        answer_summ = get_answer(context2)
        answer_summ2 = get_answer_en_es(answer_summ)
        st.text("El resumen del texto es:")
        st.write(answer_summ2)
Example #12
    if perplexity:
        # eval mode
        mean_nll = loss_sum / count_eles
        ppl = math.exp(mean_nll)
        print("Perplexity: ", datatype, ppl)
    else:
        # training in progress
        print("Mean loss", datatype, (epoch_loss / len(dataloader)))

if(globalparams["do_train"]):
    #load model from pretrained/scratch and train it/save it in the provided dir.
    print("TRAIN MODE: ")

    if(globalparams["pretrained"]):
        #load pretrained encoder and pretrained decoder.
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(globalparams['pretrained_path'], globalparams['pretrained_path'])
        print("pretrained model loaded.", globalparams["pretrained_path"])
    else:
        pass

    model.to(device)

    print(f'The model has {count_parameters(model):,} trainable parameters')

    optimizer = optim.Adam(model.parameters(), lr=modelparams['lr'])
    criterion = nn.NLLLoss(ignore_index=de_tokenizer.pad_token_id)

    num_train_batches = len(train_dataloader)
    num_valid_batches = len(valid_dataloader)

    print("num batches: ", num_train_batches, num_valid_batches)
Example #13
class PhonetizerModel:

    phon_tokenizer = {
        'e': 7,
        'i': 8,
        'R': 9,
        'a': 10,
        'o': 11,
        't': 12,
        's': 13,
        'l': 14,
        'k': 15,
        'p': 16,
        'm': 17,
        'n': 18,
        'd': 19,
        'y': 20,
        '@': 21,
        'f': 22,
        'z': 23,
        'b': 24,
        '§': 25,
        'v': 26,
        '2': 27,
        '1': 28,
        'Z': 29,
        'g': 30,
        'u': 31,
        'S': 32
    }
    phon_untokenizer = {v: k for k, v in phon_tokenizer.items()}
    char_tokenizer = {
        'e': 7,
        'i': 8,
        'a': 9,
        'r': 10,
        'o': 11,
        's': 12,
        't': 13,
        'n': 14,
        'l': 15,
        'é': 16,
        'c': 17,
        'p': 18,
        'u': 19,
        'm': 20,
        'd': 21,
        '-': 22,
        'h': 23,
        'g': 24,
        'b': 25,
        'v': 26,
        'f': 27,
        'k': 28,
        'y': 29,
        'x': 30,
        'è': 31,
        'ï': 32,
        'j': 33,
        'z': 34,
        'w': 35,
        'q': 36
    }

    def __init__(self, device='cpu', model=None):
        vocabsize = 37
        max_length = 50
        encoder_config = BertConfig(vocab_size=vocabsize,
                                    max_position_embeddings=max_length + 64,
                                    num_attention_heads=4,
                                    num_hidden_layers=4,
                                    hidden_size=128,
                                    type_vocab_size=1)
        encoder = BertModel(config=encoder_config)

        vocabsize = 33
        max_length = 50
        decoder_config = BertConfig(vocab_size=vocabsize,
                                    max_position_embeddings=max_length + 64,
                                    num_attention_heads=4,
                                    num_hidden_layers=4,
                                    hidden_size=128,
                                    type_vocab_size=1,
                                    is_decoder=True,
                                    add_cross_attention=True)
        decoder = BertLMHeadModel(config=decoder_config)

        # Define encoder decoder model
        self.model = EncoderDecoderModel(encoder=encoder, decoder=decoder)
        self.model.to(device)
        self.device = device
        if model is not None:
            self.model.load_state_dict(torch.load(model))

    def phonetize(self, word):
        word = word.replace('à', 'a')
        word = word.replace('û', 'u')
        word = word.replace('ù', 'u')
        word = word.replace('î', 'i')
        word = word.replace('ç', 'ss')
        word = word.replace('ô', 'o')
        word = word.replace('â', 'a')
        word = word.replace('qu', 'k')
        word = word.replace('ê', 'e')
        assert set(word).issubset(set(PhonetizerModel.char_tokenizer.keys()))
        encoded = torch.tensor(
            [0] + [PhonetizerModel.char_tokenizer[p] for p in word] + [2])
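        # Token ids follow the tables above: 0 = start, 1 = pad, 2 = end of
        # sequence, and only ids > 6 map back to real symbols when decoding.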
        output = self.model.generate(
            encoded.unsqueeze(0).to(self.device),
            max_length=50,
            decoder_start_token_id=0,
            eos_token_id=2,
            pad_token_id=1,
        ).detach().cpu().numpy()[0]
        bound = np.where(output == 2)[0][0] if 2 in output else 1000
        phon_pred = ''.join([
            PhonetizerModel.phon_untokenizer[c] for c in output[:bound]
            if c > 6
        ])
        return phon_pred

    def check_phonetization_error(self, word, phon):
        prediction = self.phonetize(word)[:5]
        score = pairwise2.align.globalms(list(phon[:5]),
                                         list(prediction),
                                         2,
                                         -1,
                                         -1,
                                         -.5,
                                         score_only=True,
                                         gap_char=['-']) / len(phon[:5])
        return score
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device',
                        default='0,1,2,3',
                        type=str,
                        required=False,
                        help='which GPUs to use')
    parser.add_argument('--raw_data_path',
                        default='data/train.txt',
                        type=str,
                        required=False,
                        help='raw training corpus')
    parser.add_argument('--output_dir',
                        default='model/',
                        type=str,
                        required=False,
                        help='model output directory')
    parser.add_argument('--batch_size',
                        default=1,
                        type=int,
                        required=False,
                        help='training batch size')
    parser.add_argument('--lr',
                        default=1.5e-4,
                        type=float,
                        required=False,
                        help='learning rate')
    parser.add_argument('--epochs',
                        default=5,
                        type=int,
                        required=False,
                        help='number of training epochs')

    args = parser.parse_args()
    print('args:\n' + repr(args))

    raw_data_path = args.raw_data_path
    output_dir = args.output_dir
    batch_size = args.batch_size
    lr = args.lr
    epochs = args.epochs

    # device
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # 此处设置程序使用哪些显卡
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    # model
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-multilingual-cased", "bert-base-multilingual-cased")
    model.to(device)

    # dataset
    tokenizer = BertTokenizerFast.from_pretrained(
        "bert-base-multilingual-cased")
    dataset = TextDataset(tokenizer, './dataset/train.jsonl')

    # print the parameter count
    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    # dataloader
    def pad_collate_fn(batch):
        batch_size = len(batch)
        # find longest sequence
        source_max_len = max(map(lambda x: x['source'].shape[0], batch))
        target_max_len = max(map(lambda x: x['target'].shape[0], batch))
        # pad according to max_len
        ret = {
            'source':
            torch.full((batch_size, source_max_len),
                       tokenizer.pad_token_id,
                       dtype=torch.long),
            'target':
            torch.full((batch_size, target_max_len),
                       tokenizer.pad_token_id,
                       dtype=torch.long)
        }

        for i, sample in enumerate(batch):
            sample_source = sample['source']
            sample_target = sample['target']
            ret['source'][i, :sample_source.numel()] = sample_source
            ret['target'][i, :sample_target.numel()] = sample_target
        return ret

    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             shuffle=True,
                                             collate_fn=pad_collate_fn)

    # optimizer
    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=lr,
                                  betas=(0.9, 0.999),
                                  eps=1e-8)

    print('start training')
    for epoch in range(epochs):
        with tqdm(total=len(dataloader), ascii=True) as t:
            for i, sample in enumerate(dataloader):
                optimizer.zero_grad()
                input_ids = sample['source'].to(device)
                decoder_input_ids = sample['target'].to(device)
                loss, *_ = model(input_ids=input_ids,
                                 decoder_input_ids=decoder_input_ids,
                                 labels=decoder_input_ids)
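                # Using decoder_input_ids directly as labels means pad tokens
                # count toward the loss; set pad positions to -100 to ignore them.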
                # backward
                loss.backward()
                optimizer.step()

                t.set_postfix({'loss': loss.item()})
                t.update(1)
        # save model
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        output_epoch_dir = os.path.join(output_dir, f'epoch_{str(epoch)}')
        if not os.path.exists(output_epoch_dir):
            os.mkdir(output_epoch_dir)
        torch.save(model.state_dict(),
                   os.path.join(output_epoch_dir, 'model.pth'))
Example #15
# SPDX-License-Identifier: Apache-2.0
# based on:
# https://huggingface.co/docs/transformers/model_doc/bertgeneration

from transformers import AutoTokenizer, EncoderDecoderModel

# instantiate sentence fusion model
model = EncoderDecoderModel.from_pretrained(
    "google/roberta2roberta_L-24_discofuse")
tokenizer = AutoTokenizer.from_pretrained(
    "google/roberta2roberta_L-24_discofuse")

input_ids = tokenizer(
    'This is the first sentence. This is the second sentence.',
    add_special_tokens=False,
    return_tensors="pt").input_ids

greedy_output = model.generate(input_ids)

print(f"Output ({greedy_output.shape}): {greedy_output}")
print(
    f"Detokenized: `{tokenizer.decode(greedy_output[0], skip_special_tokens=False)}`"
)
Example #16
class TracedEncoderDecoder(BaseModel):
    def __init__(self, config):
        super().__init__(config)
        self.build()

    @classmethod
    def config_path(cls):
        return "configs/models/ted/defaults.yaml"

    def build(self):

        # to be further set
        # breakpoint()
        self.image_feature_module = build_image_encoder(
            self.config.image_feature_processor, direct_features=True)
        if self.config.concate_trace:
            self.trace_feature_module = build_encoder(
                self.config.trace_feature_encoder)

        if self.config.base_model_name == "bert-base-uncased":
            self.encoderdecoder = EncoderDecoderModel.from_encoder_decoder_pretrained(
                "bert-base-uncased", "bert-base-uncased")
        elif self.config.base_model_name == "2layer-base":
            config_encoder = BertConfig()
            config_decoder = BertConfig()
            config_encoder.num_hidden_layers = 2
            config_decoder.num_hidden_layers = 2
            self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
                config_encoder, config_decoder)
            self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
        elif self.config.base_model_name == "3layer-base":
            config_encoder = BertConfig()
            config_decoder = BertConfig()
            config_encoder.num_hidden_layers = 3
            config_decoder.num_hidden_layers = 3
            self.codec_config = EncoderDecoderConfig.from_encoder_decoder_configs(
                config_encoder, config_decoder)
            self.encoderdecoder = EncoderDecoderModel(config=self.codec_config)
        if self.config.loop_contrastive:
            self.trace_caption_contrastive = TraceCaptionContrastiveModel(
                self.config.tc_contrastive_aggregate_method)
        if (hasattr(self.config, "pretrans_attention")
                and self.config.pretrans_attention):

            # import ipdb; ipdb.set_trace()
            tempconf = self.encoderdecoder.config.encoder
            num_heads = tempconf.num_attention_heads
            num_layers = tempconf.num_hidden_layers
            self.attention_trans = AttentionTransform(num_layers, num_heads,
                                                      100)
        self.BOS_ID = 101
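        # 101 is the [CLS] token id in the bert-base-uncased vocabulary; it
        # doubles as the decoder start token during generation.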

    def forward(self, sample_list, *args, **kwargs):

        # breakpoint()
        decoder_input_ids = sample_list["input_ids"][:, :-1]
        # using default mask
        # target_mask = sample_list["input_mask"]
        # segment_ids = sample_list["segment_ids"]
        # token_attends = sample_list["token_attends"]
        other_kwargs = {}
        if self.config.image_feature_processor.type == "spatial":
            bbox_feature = sample_list["image_feature_0"]
            spatial_feature = sample_list["image_info_0"]["bbox"]
            inputs_embeds = self.image_feature_module(bbox_feature,
                                                      spatial_feature)
        else:
            bbox_feature = sample_list["image_feature_0"]
            inputs_embeds = self.image_feature_module(bbox_feature)
        if hasattr(self.config, "no_vision") and self.config.no_vision:
            inputs_embeds = inputs_embeds * 0
        batch_size = inputs_embeds.shape[0]
        if self.config.concate_trace:
            trace_boxes = sample_list["trace_boxes"]
            trace_boxes_mask = sample_list["trace_boxes_mask"]
            trace_feature = self.trace_feature_module(trace_boxes)
            trace_seg_id = sample_list["trace_boxes_seg_id"]
            inputs_embeds = torch.cat((inputs_embeds, trace_feature), dim=1)
            image_feats_mask = trace_boxes_mask.new_ones((batch_size, 100))
            image_feats_seg_id = trace_seg_id.new_zeros((batch_size, 100))
            attention_mask = torch.cat((image_feats_mask, trace_boxes_mask),
                                       dim=1)
            token_type_ids = torch.cat((image_feats_seg_id, trace_seg_id),
                                       dim=1)
            position_ids = trace_seg_id.new_zeros(
                (batch_size, attention_mask.shape[1]))
            other_kwargs.update({
                "attention_mask": attention_mask,
                "token_type_ids": token_type_ids,
                "position_ids": position_ids,
            })

        if self.training:
            decoder_output = self.encoderdecoder(
                decoder_input_ids=decoder_input_ids,
                inputs_embeds=inputs_embeds,
                output_attentions=True,
                output_hidden_states=True,
                return_dict=True,
                **other_kwargs)

            logits = decoder_output["logits"]
            cross_attentions = []
            # import ipdb; ipdb.set_trace()
            for cross_attention in decoder_output["cross_attentions"]:
                if self.config.concate_trace:
                    cross_attention = cross_attention[:, :, :, :100]
                # cross_attentions.append(cross_attention.mean(dim=1))
                cross_attentions.append(cross_attention)
            # breakpoint()
            if (hasattr(self.config, "pretrans_attention")
                    and self.config.pretrans_attention):
                cross_attentions = self.attention_trans(cross_attentions)
            else:
                cross_attentions = [
                    crs.mean(dim=1) for crs in cross_attentions
                ]
            model_output = {}
            model_output["captions"] = torch.max(logits, dim=-1)[1]
            model_output["scores"] = logits
            model_output["cross_attentions"] = cross_attentions
            sample_list["targets"] = sample_list["input_ids"][:, 1:]

            if self.config.loop_contrastive:
                cap_feat, vision_trace_feat = self.trace_caption_contrastive(
                    decoder_output["encoder_hidden_states"][-1],
                    sample_list["trace_boxes_loop_contrastive_seg_id"],
                    decoder_output["decoder_hidden_states"][-1],
                    sample_list["segment_ids"],
                )
                model_output["contrastive_a"] = cap_feat
                model_output["contrastive_b"] = vision_trace_feat
        else:
            if self.config.inference.type == "beam_search":
                generate_output = self.encoderdecoder.generate(
                    input_ids=None,
                    inputs_embeds=inputs_embeds,
                    bos_token_id=self.BOS_ID,
                    decoder_start_token_id=self.BOS_ID,
                    **self.config.inference.args,
                    **other_kwargs)
            elif self.config.inference.type == "greedy":
                generate_output = self.encoderdecoder.generate(
                    input_ids=None,
                    inputs_embeds=inputs_embeds,
                    max_length=self.config.max_gen_length,
                    bos_token_id=self.BOS_ID,
                    decoder_start_token_id=self.BOS_ID,
                    **other_kwargs)
            elif self.config.inference.type == "nucleus_sampling":
                generate_output = self.encoderdecoder.generate(
                    input_ids=None,
                    inputs_embeds=inputs_embeds,
                    bos_token_id=self.BOS_ID,
                    decoder_start_token_id=self.BOS_ID,
                    **self.config.inference.args,
                    **other_kwargs)
            model_output = {}
            # breakpoint()
            if ("return_attention" in self.config.inference
                    and self.config.inference.return_attention):
                with torch.no_grad():
                    attention_temp_output = self.encoderdecoder(
                        decoder_input_ids=generate_output,
                        inputs_embeds=inputs_embeds,
                        output_attentions=True,
                        return_dict=True,
                    )
                    cross_attentions = []
                    for cross_attention in attention_temp_output[
                            "cross_attentions"]:
                        if self.config.concate_trace:
                            cross_attention = cross_attention[:, :, :, :100]
                        cross_attentions.append(cross_attention.mean(dim=1))
                    # breakpoint()
                    cross_attentions = (torch.stack(cross_attentions).max(
                        dim=0)[0].max(dim=-1)[1])
                    model_output["cross_attention"] = cross_attentions
                # breakpoint()

            model_output["captions"] = generate_output
            model_output["losses"] = {}
            loss_key = "{}/{}".format(sample_list.dataset_name,
                                      sample_list.dataset_type)
            # Add a dummy loss so that loss calculation is not required
            model_output["losses"][loss_key + "/dummy_loss"] = torch.zeros(
                batch_size, device=sample_list.image_feature_0.device)
            # breakpoint()

        return model_output
Example #17
def train_model(epochs=10,
                num_gradients_accumulation=4,
                batch_size=4,
                gpu_id=0,
                lr=1e-5,
                load_dir='/content/GPT CheckPoints/'):
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    # ------------------------LOAD MODEL-----------------
    print('load the model....')
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "gpt2", "gpt2", use_cache=False)

    model.load_state_dict(
        torch.load("/content/EnglishGPT/decoder_model/model2.pth",
                   map_location='cuda'))

    model = model.to(device)

    print('load success')
    # ------------------------END LOAD MODEL--------------

    # ------------------------LOAD TRAIN DATA------------------
    train_data = torch.load("/content/train_data.pth")
    train_dataset = TensorDataset(*train_data)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  shuffle=True,
                                  batch_size=batch_size)
    val_data = torch.load("/content/validate_data.pth")
    val_dataset = TensorDataset(*val_data)
    val_dataloader = DataLoader(dataset=val_dataset,
                                shuffle=True,
                                batch_size=batch_size)
    # ------------------------END LOAD TRAIN DATA--------------

    # ------------------------SET OPTIMIZER-------------------
    num_train_optimization_steps = len(
        train_dataset) * epochs // batch_size // num_gradients_accumulation

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01,
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=lr,
        weight_decay=0.01,
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_train_optimization_steps // 10,
        num_training_steps=num_train_optimization_steps,
    )

    # ------------------------END SET OPTIMIZER--------------

    # ------------------------START TRAINING-------------------
    update_count = 0

    start = time.time()
    print('start training....')
    for epoch in range(epochs):
        # ------------------------training------------------------
        model.train()
        losses = 0
        times = 0

        print('\n' + '-' * 20 + f'epoch {epoch}' + '-' * 20)
        for batch in tqdm(train_dataloader):
            batch = [item.to(device) for item in batch]

            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

            _, past = model.encoder(input_ids=encoder_input,
                                    attention_mask=mask_encoder_input)
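            # Pre-4.0 transformers API: the GPT-2 encoder returns a
            # (hidden_states, past) tuple, and `past` is fed to the decoder below.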

            mask = torch.cat([mask_encoder_input, mask_decoder_input], dim=1)
            logits, _ = model.decoder(decoder_input,
                                      attention_mask=mask,
                                      past=list(past))

            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()

            loss = util.sequence_cross_entropy_with_logits(out,
                                                           target,
                                                           target_mask,
                                                           average="token")
            loss.backward()

            losses += loss.item()
            times += 1

            update_count += 1

            if update_count % num_gradients_accumulation == num_gradients_accumulation - 1:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

        end = time.time()
        print(f'time: {(end - start)}')
        print(f'loss: {losses / times}')
        start = end

        # ------------------------validate------------------------
        model.eval()

        perplexity = 0
        batch_count = 0
        print('\nstart calculate the perplexity....')

        with torch.no_grad():
            for batch in tqdm(val_dataloader):
                batch = [item.to(device) for item in batch]

                encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

                _, past = model.encoder(input_ids=encoder_input,
                                        attention_mask=mask_encoder_input)

                mask = torch.cat([mask_encoder_input, mask_decoder_input],
                                 dim=1)
                logits, _ = model.decoder(decoder_input,
                                          attention_mask=mask,
                                          past=list(past))

                out = logits[:, :-1].contiguous()
                target = decoder_input[:, 1:].contiguous()
                target_mask = mask_decoder_input[:, 1:].contiguous()
                # print(out.shape,target.shape,target_mask.shape)
                loss = util.sequence_cross_entropy_with_logits(out,
                                                               target,
                                                               target_mask,
                                                               average="token")
                perplexity += np.exp(loss.item())
                batch_count += 1
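        # Averaging per-batch exp(loss) only approximates corpus perplexity
        # (exp of the mean NLL), but it is adequate for tracking progress.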

        print(f'\nvalidate perplexity: {perplexity / batch_count}')

        torch.save(
            model.state_dict(),
            os.path.join(os.path.abspath('.'), load_dir,
                         "model-" + str(epoch) + ".pth"))
Example #18
    def __init__(self, cfg: DictConfig, trainer: Trainer = None):

        # must assign tokenizers before init
        if cfg.language_model.pretrained_model_name:
            if cfg.language_model.pretrained_encoder_model_name or cfg.language_model.pretrained_decoder_model_name:
                raise ValueError(
                    "Must have either pretrained_model_name or both pretrained_encoder_model_name and "
                    "pretrained_decoder_model_name.")
            # setup tokenizer
            self.encoder_tokenizer = self.setup_tokenizer(
                cfg.encoder_tokenizer)
            self.encoder_add_special_tokens = cfg.encoder_tokenizer.add_special_tokens

            # set decoder to encoder
            self.decoder_tokenizer = self.encoder_tokenizer
            self.decoder_add_special_tokens = self.encoder_add_special_tokens
        else:
            if not (cfg.language_model.pretrained_encoder_model_name
                    and cfg.language_model.pretrained_decoder_model_name):
                raise ValueError("Both encoder and decoder must be specified")

            # setup tokenizers
            self.encoder_tokenizer = self.setup_tokenizer(
                cfg.encoder_tokenizer)
            self.encoder_add_special_tokens = cfg.encoder_tokenizer.add_special_tokens

            self.decoder_tokenizer = self.setup_tokenizer(
                cfg.decoder_tokenizer)
            self.decoder_add_special_tokens = cfg.decoder_tokenizer.add_special_tokens

        if not self.encoder_tokenizer:
            raise TypeError("encoder_tokenizer failed to initialize")
        if not self.decoder_tokenizer:
            raise TypeError("decoder_tokenizer failed to initialize")

        # init superclass
        super().__init__(cfg=cfg, trainer=trainer)

        # must assign modules after init
        if cfg.language_model.pretrained_model_name:
            # Setup end-to-end model
            if "bart" in cfg.language_model.pretrained_model_name:
                self.model = BartForConditionalGeneration.from_pretrained(
                    cfg.language_model.pretrained_model_name)
            else:
                self.model = AutoModel.from_pretrained(
                    cfg.language_model.pretrained_model_name)
        else:
            if not (cfg.language_model.pretrained_encoder_model_name
                    and cfg.language_model.pretrained_decoder_model_name):
                raise ValueError("Both encoder and decoder must be specified")

            # Setup encoder/decoder model
            self.model = EncoderDecoderModel.from_encoder_decoder_pretrained(
                encoder=cfg.language_model.pretrained_encoder_model_name,
                decoder=cfg.language_model.pretrained_decoder_model_name,
            )

        self.validation_perplexity = Perplexity(compute_on_step=False)

        self.setup_optimization(cfg.optim)
Example #19
 def test_real_bert_model_from_pretrained_has_cross_attention(self):
     model = EncoderDecoderModel.from_encoder_decoder_pretrained(
         "bert-base-uncased", "bert-base-uncased")
     self.assertTrue(
         hasattr(model.decoder.bert.encoder.layer[0], "crossattention"))
Example #20
    def check_save_and_load(self, config, input_ids, attention_mask,
                            encoder_hidden_states, decoder_config,
                            decoder_input_ids, decoder_attention_mask,
                            **kwargs):
        encoder_model, decoder_model = self.get_encoder_decoder_model(
            config, decoder_config)
        enc_dec_model = EncoderDecoderModel(encoder=encoder_model,
                                            decoder=decoder_model)
        enc_dec_model.to(torch_device)
        enc_dec_model.eval()
        with torch.no_grad():
            outputs = enc_dec_model(
                input_ids=input_ids,
                decoder_input_ids=decoder_input_ids,
                attention_mask=attention_mask,
                decoder_attention_mask=decoder_attention_mask,
            )
            out_2 = outputs[0].cpu().numpy()
            out_2[np.isnan(out_2)] = 0

            with tempfile.TemporaryDirectory() as tmpdirname:
                enc_dec_model.save_pretrained(tmpdirname)
                enc_dec_model = EncoderDecoderModel.from_pretrained(tmpdirname)
                enc_dec_model.to(torch_device)

                after_outputs = enc_dec_model(
                    input_ids=input_ids,
                    decoder_input_ids=decoder_input_ids,
                    attention_mask=attention_mask,
                    decoder_attention_mask=decoder_attention_mask,
                )
                out_1 = after_outputs[0].cpu().numpy()
                out_1[np.isnan(out_1)] = 0
                max_diff = np.amax(np.abs(out_1 - out_2))
                self.assertLessEqual(max_diff, 1e-5)
Example #21
def evaluate_style_gen_title(
    existing_run_name: str,
    existing_run_id: str,
    config_file: str,
    gen_model_file: str,
    discr_model_file: str,
    test_file: str,
    test_sample_rate: float,
):
    logging.set_verbosity_info()
    init_wandb(existing_run_name, None, existing_run_id)

    config = json.loads(jsonnet_evaluate_file(config_file))

    tokenizer_model_path = config["tokenizer_model_path"]
    tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path, do_lower_case=False, do_basic_tokenize=False)

    max_tokens_text = config["max_tokens_text"]
    max_tokens_title = config["max_tokens_title"]
    setattr(tokenizer, 'max_tokens_text', max_tokens_text)

    batch_size = config["batch_size"]

    print("Loading model...")
    model = EncoderDecoderModel.from_pretrained(gen_model_file)
    model.eval()
    model.cuda()

    agency_list = config['agency_list']
    discriminator = AutoModelForSequenceClassification.from_pretrained(
        discr_model_file, num_labels=len(agency_list)).cuda()

    print("Fetching TG data...")
    test_records = [r for r in tqdm.tqdm(tg_reader(test_file))
                    if random.random() <= test_sample_rate]

    print("Building datasets...")

    agency_to_special_token_id = {
        a: tokenizer.vocab[f'[unused{i+1}]'] for i, a in enumerate(agency_list)
    }

    agency_to_target = {a: i for i, a in enumerate(sorted(agency_list))}

    test_dataset = AgencyTitleDatasetGeneration(
        test_records, tokenizer,
        filter_agencies=list(agency_to_special_token_id.keys()),
        agency_to_special_token_id=agency_to_special_token_id,
        max_tokens_text=max_tokens_text, max_tokens_title=max_tokens_title
    )

    print('Dataset size:', len(test_dataset))

    y_pred = []
    y_true = []

    for i in tqdm.trange(0, len(test_dataset), batch_size):
        # Seed the batch with one example, keeping only the tensors that
        # generate() needs and adding a batch dimension.
        data = test_dataset[i]
        for k in tuple(data.keys()):
            if k not in ('input_ids', 'attention_mask'):
                del data[k]
            else:
                data[k] = data[k].unsqueeze(0)

        # Stack the remaining examples of the batch along dim 0.
        for j in range(i + 1, min(i + batch_size, len(test_dataset))):
            for k in data.keys():
                data[k] = torch.cat((data[k], test_dataset[j][k].unsqueeze(0)), dim=0)

        # Record the gold agency label for every example in this batch.
        y_true.extend([agency_to_target[test_dataset.get_strings(j)['agency']]
                       for j in range(i, min(i + batch_size, len(test_dataset)))])

        data['input_ids'] = data['input_ids'].cuda()
        data['attention_mask'] = data['attention_mask'].cuda()

        output_ids = model.generate(
            **data,
            decoder_start_token_id=model.config.decoder.pad_token_id,
            min_length=7,
            max_length=20,
            num_beams=6
        )

        preds = [
            tokenizer.decode(first_sent(x, tokenizer.sep_token_id), skip_special_tokens=True) for x in output_ids
        ]

        for title in preds:
            inp = tokenizer(title,
                            add_special_tokens=True, max_length=max_tokens_title,
                            padding='max_length', truncation=True)

            logits = discriminator(
                input_ids=torch.LongTensor(inp['input_ids']).cuda().unsqueeze(0),
                attention_mask=torch.LongTensor(inp['attention_mask']).cuda().unsqueeze(0))[0]
            y_pred.append(torch.argmax(logits).item())

    wandb.summary.update({
        'D-Style': classification_report(y_true, y_pred, output_dict=True)
    })
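Example #21 calls a project-specific helper, first_sent, that is not shown anywhere in the listing. A plausible minimal sketch, assuming it simply truncates the generated ids at the first separator token (an assumption, not the original implementation):

def first_sent(token_ids, sep_token_id):
    # Hypothetical reconstruction: keep everything before the first [SEP]
    # so that only the first generated sentence is decoded.
    ids = token_ids.tolist()
    return ids[:ids.index(sep_token_id)] if sep_token_id in ids else ids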
Example #22
def get_pretrained_model(self):
    return EncoderDecoderModel.from_encoder_decoder_pretrained(
        "google/bert_for_seq_generation_L-24_bbc_encoder",
        "google/bert_for_seq_generation_L-24_bbc_encoder")
Example #23
def get_pretrained_model(self):
    return EncoderDecoderModel.from_encoder_decoder_pretrained("roberta-base", "roberta-base")
Example #24
def get_pretrained_model(self):
    return EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-large-uncased", "microsoft/prophetnet-large-uncased")
Example #25
def get_from_pretrained(path):
    # Read the saved EncoderDecoderConfig next to the checkpoint and pass
    # it explicitly when loading the weights (join/dirname from os.path).
    conf_path = join(dirname(path), "config.json")
    conf = EncoderDecoderConfig.from_pretrained(conf_path)
    model = EncoderDecoderModel.from_pretrained(path, config=conf)
    return model
Example #26
def get_pretrained_model(self):
    return EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-large-uncased", "facebook/bart-large")
Example #27
# In[4]:

tokenizer_model_path = config["tokenizer_model_path"]
tokenizer = BertTokenizer.from_pretrained(tokenizer_model_path,
                                          do_lower_case=False,
                                          do_basic_tokenize=False)

max_tokens_text = config["max_tokens_text"]
max_tokens_title = config["max_tokens_title"]

# In[5]:

enc_model_path = config["enc_model_path"]
dec_model_path = config["dec_model_path"]
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    enc_model_path, dec_model_path)

# In[8]:

from torch.utils.data import Dataset


class StyleModelDataset(Dataset):
    def __init__(
            self,
            path,
            tokenizer,
            agency,  # lenta or ria
            is_train=True,
            max_tokens_text=250,
            max_tokens_title=50):
Example #28
def get_encoderdecoder_model(self):
    return EncoderDecoderModel.from_pretrained(
        "patrickvonplaten/bert2bert-cnn_dailymail-fp16")
Example #29
    def test_finetune_bert2bert(self):
        if not is_datasets_available():
            return

        import datasets

        bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained(
            "prajjwal1/bert-tiny", "prajjwal1/bert-tiny")
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

        bert2bert.config.vocab_size = bert2bert.config.encoder.vocab_size
        bert2bert.config.eos_token_id = tokenizer.sep_token_id
        bert2bert.config.decoder_start_token_id = tokenizer.cls_token_id
        bert2bert.config.max_length = 128

        train_dataset = datasets.load_dataset("cnn_dailymail",
                                              "3.0.0",
                                              split="train[:1%]")
        val_dataset = datasets.load_dataset("cnn_dailymail",
                                            "3.0.0",
                                            split="validation[:1%]")

        train_dataset = train_dataset.select(range(32))
        val_dataset = val_dataset.select(range(16))

        rouge = datasets.load_metric("rouge")

        batch_size = 4

        def _map_to_encoder_decoder_inputs(batch):
            # Tokenizer will automatically set [BOS] <text> [EOS]
            inputs = tokenizer(batch["article"],
                               padding="max_length",
                               truncation=True,
                               max_length=512)
            outputs = tokenizer(batch["highlights"],
                                padding="max_length",
                                truncation=True,
                                max_length=128)
            batch["input_ids"] = inputs.input_ids
            batch["attention_mask"] = inputs.attention_mask

            batch["decoder_input_ids"] = outputs.input_ids
            batch["labels"] = outputs.input_ids.copy()
            batch["labels"] = [[
                -100 if token == tokenizer.pad_token_id else token
                for token in labels
            ] for labels in batch["labels"]]
            batch["decoder_attention_mask"] = outputs.attention_mask

            assert all([len(x) == 512 for x in inputs.input_ids])
            assert all([len(x) == 128 for x in outputs.input_ids])

            return batch

        def _compute_metrics(pred):
            labels_ids = pred.label_ids
            pred_ids = pred.predictions

            # all unnecessary tokens are removed
            pred_str = tokenizer.batch_decode(pred_ids,
                                              skip_special_tokens=True)
            label_str = tokenizer.batch_decode(labels_ids,
                                               skip_special_tokens=True)

            rouge_output = rouge.compute(predictions=pred_str,
                                         references=label_str,
                                         rouge_types=["rouge2"])["rouge2"].mid

            return {
                "rouge2_precision": round(rouge_output.precision, 4),
                "rouge2_recall": round(rouge_output.recall, 4),
                "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
            }

        # map train dataset
        train_dataset = train_dataset.map(
            _map_to_encoder_decoder_inputs,
            batched=True,
            batch_size=batch_size,
            remove_columns=["article", "highlights"],
        )
        train_dataset.set_format(
            type="torch",
            columns=[
                "input_ids", "attention_mask", "decoder_input_ids",
                "decoder_attention_mask", "labels"
            ],
        )

        # same for validation dataset
        val_dataset = val_dataset.map(
            _map_to_encoder_decoder_inputs,
            batched=True,
            batch_size=batch_size,
            remove_columns=["article", "highlights"],
        )
        val_dataset.set_format(
            type="torch",
            columns=[
                "input_ids", "attention_mask", "decoder_input_ids",
                "decoder_attention_mask", "labels"
            ],
        )

        output_dir = self.get_auto_remove_tmp_dir()

        training_args = Seq2SeqTrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            predict_with_generate=True,
            evaluate_during_training=True,
            do_train=True,
            do_eval=True,
            warmup_steps=0,
            eval_steps=2,
            logging_steps=2,
        )

        # instantiate trainer
        trainer = Seq2SeqTrainer(
            model=bert2bert,
            args=training_args,
            compute_metrics=_compute_metrics,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
        )

        # start training
        trainer.train()
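Example #29 trains the model but never generates with it. A minimal sketch of the inference step that would typically follow (standard generate API; the input text is illustrative):

# Beam-search summarization with the fine-tuned bert2bert model, reusing
# the training tokenizer. Illustrative only.
inputs = tokenizer("Some news article text.", return_tensors="pt",
                   truncation=True, max_length=512)
summary_ids = bert2bert.generate(inputs.input_ids,
                                 attention_mask=inputs.attention_mask,
                                 num_beams=4)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))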
Example #30
decoder_config = BertGenerationConfig(  # leading arguments truncated in source
    eos_token_id=decoder_tokenizer.vocab["[EOS]"],
)
# AutoConfig.from_pretrained("bert-base-uncased")
#decoder_config = BertGenerationDecoderConfig()

# From: https://github.com/huggingface/transformers/blob/master/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py#L464
#>>> model.config.decoder_start_token_id = tokenizer.cls_token_id
#>>> model.config.pad_token_id = tokenizer.pad_token_id
#>>> model.config.vocab_size = model.config.decoder.vocab_size
#decoder_config.decoder_start_token_id = decoder_tokenizer.vocab["[CLS]"]
# decoder_config.pad_token_type_id = 0 ?
decoder = BertGenerationDecoder(config=decoder_config)

#enc_dec_config = EncoderDecoderConfig(encoder=encoder.config, decoder=decoder.config, decoder_start_token_id=decoder_tokenizer.vocab["[CLS]"])

bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
bert2bert.config.decoder_start_token_id = decoder_tokenizer.vocab["[CLS]"]
bert2bert.config.pad_token_id = decoder_tokenizer.vocab["[PAD]"]

# Tokenize inputs and labels.
inputs = encoder_tokenizer(
    'Make a stack of all blocks except the green block.',
    add_special_tokens=False,
    return_tensors="pt")
print("Inputs: ", inputs)
labels = decoder_tokenizer(
    "has_anything(robot),on_surface(blue_block, tabletop),stacked(blue_block, red_block),on_surface(yellow_block, tabletop)",
    return_tensors="pt",
    padding=True,
    truncation=True)
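The listing stops right after tokenization. A minimal sketch of the step that would typically come next (standard EncoderDecoderModel forward API, not part of the original snippet; on older transformers versions decoder_input_ids must be passed explicitly rather than derived from labels):

# Teacher-forced forward pass: with labels set, the model shifts them
# internally to build decoder inputs and returns a cross-entropy loss.
outputs = bert2bert(input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    labels=labels.input_ids)
print("Loss:", outputs.loss.item())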