Example #1
 def model(self):
     """Only load the model if needed."""
     model = MBartForConditionalGeneration.from_pretrained(
         self.checkpoint_name).to(torch_device)
     if "cuda" in torch_device:
         model = model.half()
     return model
Example #2
 def test_mbart_fast_forward(self):
     config = MBartConfig(
         vocab_size=99,
         d_model=24,
         encoder_layers=2,
         decoder_layers=2,
         encoder_attention_heads=2,
         decoder_attention_heads=2,
         encoder_ffn_dim=32,
         decoder_ffn_dim=32,
         max_position_embeddings=48,
         add_final_layer_norm=True,
         return_dict=True,
     )
     lm_model = MBartForConditionalGeneration(config).to(torch_device)
     context = torch.Tensor([[71, 82, 18, 33, 46, 91, 2],
                             [68, 34, 26, 58, 30, 2,
                              1]]).long().to(torch_device)
     summary = torch.Tensor([[82, 71, 82, 18, 2],
                             [58, 68, 2, 1, 1]]).long().to(torch_device)
     result = lm_model(input_ids=context,
                       decoder_input_ids=summary,
                       labels=summary)
     expected_shape = (*summary.shape, config.vocab_size)
     self.assertEqual(result.logits.shape, expected_shape)
Example #3
    def test_multilingual_translation(self):
        model = MBartForConditionalGeneration.from_pretrained(
            "facebook/mbart-large-50-many-to-many-mmt")
        tokenizer = MBart50TokenizerFast.from_pretrained(
            "facebook/mbart-large-50-many-to-many-mmt")

        translator = pipeline(task="translation",
                              model=model,
                              tokenizer=tokenizer)
        # Missing src_lang, tgt_lang
        with self.assertRaises(ValueError):
            translator("This is a test")

        outputs = translator("This is a test",
                             src_lang="en_XX",
                             tgt_lang="ar_AR")
        self.assertEqual(outputs, [{"translation_text": "هذا إختبار"}])

        outputs = translator("This is a test",
                             src_lang="en_XX",
                             tgt_lang="hi_IN")
        self.assertEqual(outputs, [{"translation_text": "यह एक परीक्षण है"}])

        # src_lang, tgt_lang can be defined at pipeline call time
        translator = pipeline(task="translation",
                              model=model,
                              tokenizer=tokenizer,
                              src_lang="en_XX",
                              tgt_lang="ar_AR")
        outputs = translator("This is a test")
        self.assertEqual(outputs, [{"translation_text": "هذا إختبار"}])
Example #4
    def __init__(
        self,
        model_or_path: str = "facebook/mbart-large-50-many-to-many-mmt",
        tokenizer_path: str = None,
        device: str = "auto",
        model_options: dict = None,
        tokenizer_options: dict = None,
    ):
        """
        Instantiates a multilingual transformer model for translation.

        {{params}}
        {{model_or_path}} The path or name of the model. Equivalent to the first argument of AutoModel.from_pretrained().
        {{device}} "cpu", "gpu" or "auto". If set to "auto", a GPU is selected when available, otherwise it falls back to CPU.
        {{tokenizer_path}} The path to the tokenizer, only if it differs from `model_or_path`; otherwise, leave it as `None`.
        {{model_options}} The keyword arguments passed to the transformer model, which is an mBART-Large for conditional generation.
        {{tokenizer_options}} The keyword arguments passed to the tokenizer, which is an mBART-50 fast tokenizer.
        """
        self.model_or_path = model_or_path
        self.device = _select_device(device)

        # Resolve default values
        tokenizer_path = tokenizer_path or self.model_or_path
        model_options = model_options or {}
        tokenizer_options = tokenizer_options or {}

        self.tokenizer = MBart50TokenizerFast.from_pretrained(
            tokenizer_path, **tokenizer_options)

        if model_or_path.endswith(".pt"):
            self.bart_model = torch.load(model_or_path,
                                         map_location=self.device).eval()
        else:
            self.bart_model = (MBartForConditionalGeneration.from_pretrained(
                self.model_or_path, **model_options).to(self.device).eval())
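The constructor above relies on a `_select_device` helper that is not shown in this example. A minimal sketch of what it might look like, assuming the "cpu"/"gpu"/"auto" semantics described in the docstring:

import torch

def _select_device(device: str = "auto") -> torch.device:
    # Hypothetical helper: resolve "auto"/"gpu"/"cpu" into a torch.device.
    if device == "auto":
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device == "gpu":
        return torch.device("cuda")
    return torch.device("cpu")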
Example #5
def predict(model_name, test_file, output_file, batch_size,
            max_source_tokens_count, max_target_tokens_count, use_cuda):
    tokenizer = MBartTokenizer.from_pretrained(model_name)
    test_dataset = MBartSummarizationDataset(test_file, tokenizer,
                                             max_source_tokens_count,
                                             max_target_tokens_count)
    device = torch.device("cuda:0") if use_cuda else torch.device("cpu")
    model = MBartForConditionalGeneration.from_pretrained(model_name)
    model.to(device)
    predictions = []
    for batch in test_dataset:
        summaries = model.generate(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            num_beams=5,
            length_penalty=1.0,
            max_length=max_target_tokens_count + 2,
            min_length=5,
            no_repeat_ngram_size=0,
            early_stopping=True)
        for s in summaries:
            p = tokenizer.decode(s,
                                 skip_special_tokens=True,
                                 clean_up_tokenization_spaces=False)
            predictions.append(p)
    with open(output_file, "w") as w:
        for p in predictions:
            w.write(p.strip() + "\n")
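`MBartSummarizationDataset` is not defined in this example, and `batch_size` is never used inside the loop, so the dataset itself is apparently expected to yield ready-made batches with `input_ids` and `attention_mask`. A minimal sketch under those assumptions (the JSON-lines input format and the `text` field name are guesses):

import json

class MBartSummarizationDataset:
    # Hypothetical sketch: yields tokenized batches suitable for model.generate().
    def __init__(self, path, tokenizer, max_source_tokens_count,
                 max_target_tokens_count, batch_size=4):
        self.records = [json.loads(line) for line in open(path, encoding="utf-8")]
        self.tokenizer = tokenizer
        self.max_source_tokens_count = max_source_tokens_count
        self.batch_size = batch_size

    def __iter__(self):
        for start in range(0, len(self.records), self.batch_size):
            texts = [r["text"] for r in self.records[start:start + self.batch_size]]
            yield self.tokenizer(texts,
                                 return_tensors="pt",
                                 padding=True,
                                 truncation=True,
                                 max_length=self.max_source_tokens_count)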
Example #6
 def __init__(self) -> None:
     self.model = MBartForConditionalGeneration.from_pretrained(
         "facebook/mbart-large-50-many-to-many-mmt"
     )
     self.tokenizer = MBart50TokenizerFast.from_pretrained(
         "facebook/mbart-large-50-many-to-many-mmt"
     )
Example #7
 def test_generate_fp16(self):
     config, input_dict = self.model_tester.prepare_config_and_inputs()
     input_ids = input_dict["input_ids"]
     attention_mask = input_ids.ne(1).to(torch_device)
     model = MBartForConditionalGeneration(config).eval().to(torch_device)
     if torch_device == "cuda":
         model.half()
     model.generate(input_ids, attention_mask=attention_mask)
     model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
Example #8
def get_summarization_agents():
    agents = {
        "model":
        MBartForConditionalGeneration.from_pretrained(
            "vasudevgupta/mbart-summarizer-interiit"),
        "tokenizer":
        MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
    }
    return agents
Example #9
 def __init__(self):
     try:
         # use Facebook's many-to-many multilingual translation model (mBART-50)
         model_name = "facebook/mbart-large-50-many-to-many-mmt"
         self.model = MBartForConditionalGeneration.from_pretrained(
             model_name)
         self.tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
     except Exception as e:
         logging.error(f"Error initializing model. {e}")
Example #10
 def __init__(self, src_lang, tgt_lang):
     super().__init__()
     self.batch_size = 16
     self.lr = 3e-5
     self.src_lang = src_lang
     self.tgt_lang = tgt_lang
     self.model = MBartForConditionalGeneration.from_pretrained(
         "facebook/mbart-large-en-ro"
     )
Example #11
    def __init__(self):

        self.model = MBartForConditionalGeneration.from_pretrained(
            'facebook/mbart-large-50-many-to-many-mmt')
        self.tokenizer = MBart50TokenizerFast.from_pretrained(
            'facebook/mbart-large-50-many-to-many-mmt')
        self.supported_langs = [
            'en_XX', 'gu_IN', 'hi_IN', 'bn_IN', 'ml_IN', 'mr_IN', 'ta_IN',
            'te_IN'
        ]
Example #12
    def __init__(
        self,
        hparams: Namespace,
    ):
        super().__init__()
        self.hparams = hparams

        self.tokenizer = MBartTokenizer.from_pretrained(
            self.hparams.model_checkpoint)
        self.model = MBartForConditionalGeneration.from_pretrained(
            self.hparams.model_checkpoint)
Example #13
 def __init__(self, config):
     model_name = config.get("model_name", None)
     model_path = config.get("model_path", None)
     device = config.get("device", 0)  # default on gpu 0
     self.tokenizer = MBart50TokenizerFast.from_pretrained(model_path)
     self.model = MBartForConditionalGeneration.from_pretrained(model_path)
     self.model.eval()
     self.model.half()
     self.device = torch.device(
         "cpu" if device < 0 else "cuda:{}".format(device))
     if self.device.type == "cuda":
         self.model = self.model.to(self.device)
Example #14
def generate_summaries_or_translations(
    examples: List[str],
    out_file: str,
    model_name: str,
    batch_size: int = 8,
    device: str = DEFAULT_DEVICE,
    fp16=False,
    task="summarization",
    prefix=None,
    **generate_kwargs,
) -> Dict:
    """Save model.generate results to <out_file>, and return how long it took."""
    fout = Path(out_file).open("w", encoding="utf-8")
    model_name = str(model_name)
    #model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    model = MBartForConditionalGeneration.from_pretrained(model_name).to(
        device)
    if fp16:
        model = model.half()

    tokenizer = MBartTokenizer.from_pretrained(model_name)
    #tokenizer = AutoTokenizer.from_pretrained(model_name)
    #logger.info(f"Inferred tokenizer type: {tokenizer.__class__}")  # if this is wrong, check config.model_type.

    start_time = time.time()
    # update config with task specific params
    use_task_specific_params(model, task)
    if prefix is None:
        prefix = prefix or getattr(model.config, "prefix", "") or ""
    for examples_chunk in tqdm(list(chunks(examples, batch_size))):
        examples_chunk = [prefix + text for text in examples_chunk]
        batch = tokenizer(examples_chunk,
                          return_tensors="pt",
                          truncation=True,
                          padding="longest").to(device)
        summaries = model.generate(
            input_ids=batch.input_ids,
            attention_mask=batch.attention_mask,
            #**generate_kwargs,
        )
        dec = tokenizer.batch_decode(summaries,
                                     skip_special_tokens=True,
                                     clean_up_tokenization_spaces=False)
        for hypothesis in dec:
            fout.write(hypothesis + "\n")
            fout.flush()
    fout.close()
    runtime = int(time.time() - start_time)  # seconds
    n_obs = len(examples)
    return dict(n_obs=n_obs,
                runtime=runtime,
                seconds_per_sample=round(runtime / n_obs, 4))
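`chunks` and `use_task_specific_params` come from the surrounding seq2seq example utilities and are not shown here. A sketch of what they typically do:

def chunks(lst, n):
    # Yield successive n-sized chunks from lst.
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def use_task_specific_params(model, task):
    # Copy task-specific generation parameters (if any) into the model config.
    task_specific_params = model.config.task_specific_params
    if task_specific_params is not None:
        model.config.update(task_specific_params.get(task, {}))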
Example #15
def convert_fairseq_mbart_checkpoint_from_disk(
        checkpoint_path,
        hf_config_path="facebook/mbart-large-en-ro",
        finetuned=False,
        mbart_50=False):
    state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
    remove_ignore_keys_(state_dict)
    vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0]

    mbart_config = MBartConfig.from_pretrained(hf_config_path,
                                               vocab_size=vocab_size)
    if mbart_50 and finetuned:
        mbart_config.activation_function = "relu"

    state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
    model = MBartForConditionalGeneration(mbart_config)
    model.model.load_state_dict(state_dict)

    if finetuned:
        model.lm_head = make_linear_from_emb(model.model.shared)

    return model
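`remove_ignore_keys_` and `make_linear_from_emb` are defined elsewhere in the conversion script. A sketch of what they usually do (the exact list of ignored fairseq keys is an assumption):

from torch import nn

def remove_ignore_keys_(state_dict):
    # Drop fairseq bookkeeping entries that have no Hugging Face counterpart.
    ignore_keys = [
        "encoder.version",
        "decoder.version",
        "model.encoder.version",
        "model.decoder.version",
        "_float_tensor",
    ]
    for k in ignore_keys:
        state_dict.pop(k, None)

def make_linear_from_emb(emb):
    # Build an lm_head whose weights are tied to the shared embedding matrix.
    vocab_size, emb_size = emb.weight.shape
    lin_layer = nn.Linear(emb_size, vocab_size, bias=False)
    lin_layer.weight.data = emb.weight.data
    return lin_layer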
Example #16
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_path = os.path.join(bolt.ARTIFACT_DIR, 'MBart_translation.pt')

    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
    model = MBartForConditionalGeneration.from_pretrained(
        'facebook/mbart-large-cc25')
    print("loading model")
    model.load_state_dict(torch.load(model_path))
    print("model loaded")
    sentences_lst = "i love you"

    result = translate(sentences_lst, tokenizer, model, 3, device)
    print(result)
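`translate` is not shown in this example. Judging from the call `translate(sentences_lst, tokenizer, model, 3, device)`, a plausible sketch (treating the third positional argument as a beam count is an assumption):

import torch

def translate(texts, tokenizer, model, num_beams, device):
    # Hypothetical helper: tokenize, run beam search, and decode back to text.
    if isinstance(texts, str):
        texts = [texts]
    model = model.to(device).eval()
    batch = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        generated = model.generate(**batch, num_beams=num_beams, max_length=128)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)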
Example #17
    def __init__(self, cfg_path, cfg_name):
        """
        Constructor of BartForSeq2SeqLM

        Args:
            cfg_path (str): parent directory path of the config file
            cfg_name (str): config file name

        """
        super().__init__(**self.load_args(cfg_path, cfg_name))
        self.model = MBartForConditionalGeneration.from_pretrained(
            "facebook/mbart-large-cc25")

        if self.precision == 16:
            self.model = self.model.half()
Example #18
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
    model = MBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')

    # example_english_phrase = ["I love you", 'you hate i']
    # expected_translation_chinese = ["我中意你", '你憎我']
    print("Loading and processing data")
    en, yue = read_file("../MARIAN/en2yue/train.en", "../MARIAN/en2yue/train.yue")
    val_en, val_yue = read_file("../MARIAN/en2yue/val.en", '../MARIAN/en2yue/val.yue')


    train_dataset = token_(tokenizer, en, yue)
    loader = create_data_loader(train_dataset, 8)

    val_dataset = token_(tokenizer, val_en, val_yue)
    val_loader = create_data_loader(val_dataset, 8)

    EPOCHS = 10
    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = len(loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    resultdir = bolt.ARTIFACT_DIR
    MODEL_SAVE_PATH = os.path.join(resultdir, 'MBart_translation.pt')

    print("Start training")

    best_val_loss = float('inf')
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 30)
        train_loss = train_epoch(model, loader, optimizer, scheduler, device)
        val_loss = evaluate_epoch(model, val_loader, device)
        print(f'Train_loss: {train_loss} | Val_loss: {val_loss}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), MODEL_SAVE_PATH)

        bolt.send_metrics({
            "Total_train_loss": train_loss,
            "Total_val_loss": val_loss
        })
Example #19
    def load(self, path):
        """
        Loads a model specified by path.

        Args:
            path: model path

        Returns:
            (model, tokenizer)
        """

        if path.startswith("Helsinki-NLP"):
            model = MarianMTModel.from_pretrained(path)
            tokenizer = MarianTokenizer.from_pretrained(path)
        else:
            model = MBartForConditionalGeneration.from_pretrained(path)
            tokenizer = MBart50TokenizerFast.from_pretrained(path)

        # Apply model initialization routines
        model = self.prepare(model)

        return (model, tokenizer)
Example #20
def get_pipeline():
    model = MBartForConditionalGeneration.from_pretrained(
        "facebook/mbart-large-50-many-to-many-mmt")
    tokenizer = MBart50TokenizerFast.from_pretrained(
        "facebook/mbart-large-50-many-to-many-mmt")
    return model, tokenizer
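One way the pair returned by `get_pipeline` could be consumed, mirroring the pipeline-based translation test earlier in this listing (the language codes here are only illustrative):

from transformers import pipeline

model, tokenizer = get_pipeline()
translator = pipeline("translation", model=model, tokenizer=tokenizer,
                      src_lang="en_XX", tgt_lang="fr_XX")
print(translator("This is a test"))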
Example #21
    length = len(data)
    num_batch = int(np.ceil(length / bs))

    for i in range(num_batch):
        begin = i * bs
        stop = min((i+1)*bs, length)
        source = src[begin:stop]
        target = tgt[begin:stop]

        sources = tokenizer(source, return_tensors='pt', max_length=ml, padding=True, truncation=True)
        targets = tokenizer(target, return_tensors='pt', max_length=256, padding=True, truncation=True)
        tar_ids = targets['input_ids']
        tar_mask = targets['attention_mask']
        src_ids = sources['input_ids']
        src_mask = sources['attention_mask']

        if peg:
            prefix = torch.tensor([0]).unsqueeze(0).repeat_interleave(tar_ids.shape[0], 0)
            tar_ids = torch.cat((prefix, tar_ids), 1)
            prefix = torch.tensor([1]).unsqueeze(0).repeat_interleave(tar_mask.shape[0], 0)
            tar_mask = torch.cat((prefix, tar_mask), 1)

        yield src_ids, tar_ids, src_mask, tar_mask


if __name__ == '__main__':
    from transformers import MBartForConditionalGeneration, MBartTokenizer

    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro", use_cache=False)
    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
    gen_bt(4, tokenizer, 'val', dataset='wmt', shuffle=False)
Example #22
                        default="main",
                        help="configurations defined in config.py")
    p_args = parser.parse_args()

    args = getattr(config, p_args.config)
    print(args)

    ## use this for running sweep
    # wandb.init(config=args.__dict__)
    # args = wandb.config
    # print(dict(args))

    tokenizer = MBartTokenizer.from_pretrained(args.tokenizer_id)

    if args.load_dir:
        bart = MBartForConditionalGeneration(args.bart_config)
        print(f"model is loaded from {args.load_dir}")
    else:
        bart = MBartForConditionalGeneration.from_pretrained(args.model_id)
        print(f"model is loaded from {args.model_id}")

    print("====Working on layers freezing====")
    bart.ffn_requires_grad_(args.enc_ffn_grad, args.dec_ffn_grad)
    bart.attn_requires_grad_(args.enc_attn_grad, args.dec_attn_grad,
                             args.cross_attn_grad)
    bart.embed_requires_grad_(args.embed_grad, args.pos_embed_grad)
    bart.norm_requires_grad_(args.enc_norm_grad, args.dec_norm_grad,
                             args.cross_attn_norm_grad)

    print("====Working on adding adapters====")
    bart.add_adapter_(
Example #23
def download_model():
    model_name = "facebook/mbart-large-50-many-to-many-mmt"
    model = MBartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = MBart50Tokenizer.from_pretrained(model_name)
    return model, tokenizer
Example #24
# hf-experiments
# @author Loreto Parisi (loretoparisi at gmail dot com)
# Copyright (c) 2020-2021 Loreto Parisi (loretoparisi at gmail dot com)
# HF: https://huggingface.co/facebook/mbart-large-50-one-to-many-mmt

import os
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

article_en = "The head of the United Nations says there is no military solution in Syria"
model = MBartForConditionalGeneration.from_pretrained(
    "facebook/mbart-large-50-one-to-many-mmt",
    cache_dir=os.getenv("cache_dir", "../../models"))
tokenizer = MBart50TokenizerFast.from_pretrained(
    "facebook/mbart-large-50-one-to-many-mmt",
    src_lang="en_XX",
    cache_dir=os.getenv("cache_dir", "../../models"))

model_inputs = tokenizer(article_en, return_tensors="pt")

# translate from English to Hindi
generated_tokens = model.generate(
    **model_inputs, forced_bos_token_id=tokenizer.lang_code_to_id["hi_IN"])
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => 'संयुक्त राष्ट्र के नेता कहते हैं कि सीरिया में कोई सैन्य समाधान नहीं है'

# translate from English to Chinese
generated_tokens = model.generate(
    **model_inputs, forced_bos_token_id=tokenizer.lang_code_to_id["zh_CN"])
decoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => '联合国首脑说,叙利亚没有军事解决办法'
print(decoded)
Example #25
def main(params):
    """ Finetunes the mBart50 model on some languages and
    then evaluates the BLEU score for each direction."""

    if params.wandb:
        wandb.init(project='mnmt', entity='nlp-mnmt-project', group='finetuning',
            config={k: v for k, v in params.__dict__.items() if isinstance(v, (float, int, str, list))})

    new_root_path = params.location
    new_name = params.name
    logger = logging.TrainLogger(params)
    logger.make_dirs()
    logger.save_params()

    # load model and tokenizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50")
    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50").to(device)
    optimizer = torch.optim.Adam(model.parameters())

    # scale in terms of max lr
    lr_scale = params.max_lr * np.sqrt(params.warmup_steps)
    scheduler = WarmupDecay(optimizer, params.warmup_steps, 1, lr_scale=lr_scale)

    # set dropout
    model.config.dropout = params.dropout 
    model.config.attention_dropout = params.dropout

    def pipeline(dataset, langs, batch_size, max_len):

        cols = ['input_ids_' + l for l in langs]

        def tokenize_fn(example):
            """apply tokenization"""
            l_tok = []
            for lang in langs:
                encoded = tokenizer.encode(example[lang])
                encoded[0] = tokenizer.lang_code_to_id[LANG_CODES[lang]]
                l_tok.append(encoded)
            return {'input_ids_' + l: tok for l, tok in zip(langs, l_tok)}

        def pad_seqs(examples):
            """Apply padding"""
            ex_langs = list(zip(*[tuple(ex[col] for col in cols) for ex in examples]))
            ex_langs = tuple(pad_sequence(x, batch_first=True, max_len=max_len) for x in ex_langs)
            return ex_langs

        dataset = filter_languages(dataset, langs)
        dataset = dataset.map(tokenize_fn)
        dataset.set_format(type='torch', columns=cols)
        num_examples = len(dataset)
        print('-'.join(langs) + ' : {} examples.'.format(num_examples))
        dataloader = torch.utils.data.DataLoader(dataset,
                                                batch_size=batch_size,
                                                collate_fn=pad_seqs)
        return dataloader, num_examples

    # load data
    dataset = load_dataset('ted_multi')
    train_dataset = dataset['train']
    test_dataset = dataset['validation' if params.split == 'val' else 'test']

    # preprocess splits for each direction
    num_train_examples = {}
    train_dataloaders, val_dataloaders, test_dataloaders = {}, {}, {}
    for l1, l2 in combinations(params.langs, 2):
        train_dataloaders[l1+'-'+l2], num_train_examples[l1+'-'+l2] = pipeline(
            train_dataset, [l1, l2], params.batch_size, params.max_len)
        test_dataloaders[l1+'-'+l2], _ = pipeline(test_dataset, [l1, l2], params.batch_size, params.max_len)

    # print dataset sizes
    for direction, num in num_train_examples.items():
        print(direction, ': {} examples.'.format(num))

    def freeze_layers(layers, unfreeze=False):
        for n in layers:
            for parameter in model.model.encoder.layers[n].parameters():
                parameter.requires_grad = unfreeze

    # define loss function
    if params.label_smoothing is not None:
        loss_object = LabelSmoothingLoss(params.label_smoothing)
        loss_fn = lambda out, tar: loss_object(out.logits, tar)
    else:
        loss_fn = lambda out, tar: out.loss

    # train the model
    _target = torch.tensor(1.0).to(device)
    def train_step(x, y, aux=False):

        y_inp, y_tar = y[:,:-1].contiguous(), y[:,1:].contiguous()
        enc_mask, dec_mask = (x != 0), (y_inp != 0)

        x, y_inp, y_tar, enc_mask, dec_mask = to_devices(
          (x, y_inp, y_tar, enc_mask, dec_mask), device)

        model.train()
        if aux: freeze_layers(params.frozen_layers, unfreeze=True)
        output = model(input_ids=x, decoder_input_ids=y_inp,
                   labels=y_tar, attention_mask=enc_mask,
                   decoder_attention_mask=dec_mask)
        optimizer.zero_grad()
        loss = loss_fn(output, y_tar)
        loss.backward(retain_graph=aux)

        if aux: freeze_layers(params.frozen_layers)
        torch.set_grad_enabled(aux)

        x_enc = output.encoder_last_hidden_state
        y_enc = model.model.encoder(y_inp, attention_mask=dec_mask)['last_hidden_state']
        x_enc = torch.max(x_enc + -999 * (1-enc_mask.type(x_enc.dtype)).unsqueeze(-1), dim=1)[0]
        y_enc = torch.max(y_enc + -999 * (1-dec_mask.type(y_enc.dtype)).unsqueeze(-1), dim=1)[0]
        aux_loss = F.cosine_embedding_loss(x_enc, y_enc, _target)
        scaled_aux_loss = params.aux_strength * aux_loss
        
        torch.set_grad_enabled(True)
        if aux: scaled_aux_loss.backward()

        optimizer.step()
        scheduler.step()

        accuracy = accuracy_fn(output.logits, y_tar)

        return loss.item(), aux_loss.item(), accuracy.item()

    # prepare iterators
    iterators = {direction: iter(loader) for direction, loader in train_dataloaders.items()}

    # compute sampling probabilites (and set zero shot directions to 0)
    num_examples = num_train_examples.copy()
    zero_shots = [(params.zero_shot[i]+'-'+params.zero_shot[i+1]) for i in range(0, len(params.zero_shot), 2)]
    for d in zero_shots:
        num_examples[d] = 0
    directions, num_examples = list(num_examples.keys()), np.array(list(num_examples.values()))
    dir_dist = (num_examples ** params.temp) / ((num_examples ** params.temp).sum())

    #train
    losses, aux_losses, accs = [], [], []
    start_ = time.time()
    for i in range(params.train_steps):

        # sample a direction
        direction = directions[int(np.random.choice(len(num_examples), p=dir_dist))]
        try: # check iterator is not exhausted
            x, y = next(iterators[direction])
        except StopIteration:
            iterators[direction] = iter(train_dataloaders[direction])
            x, y = next(iterators[direction])
        x, y = get_direction(x, y, sample=not params.single_direction)
           
        # train on the direction
        loss, aux_loss, acc = train_step(x, y, aux=params.auxiliary)
        losses.append(loss)
        aux_losses.append(aux_loss)
        accs.append(acc)

        if i % params.verbose == 0:
            print('Batch {} Loss {:.4f} Aux Loss {:.4f} Acc {:.4f} in {:.4f} secs per batch'.format(
                i, np.mean(losses[-params.verbose:]), np.mean(aux_losses[-params.verbose:]),
                np.mean(accs[-params.verbose:]), (time.time() - start_)/(i+1)))
        if params.wandb:
            wandb.log({'train_loss':loss, 'aux_loss':aux_loss, 'train_acc':acc})

    # save results
    if params.save:
        logger.save_model(params.train_steps, model, optimizer, scheduler=scheduler)
    
    train_results = {'loss':[np.mean(losses)], 'aux_loss':[np.mean(aux_losses)], 'accuracy':[np.mean(accs)]}
    pd.DataFrame(train_results).to_csv(logger.root_path + '/train_results.csv', index=False)

    # evaluate the model
    def evaluate(x, y, y_code, bleu):
        y_inp, y_tar = y[:,:-1].contiguous(), y[:,1:].contiguous()
        enc_mask = (x != 0)
        x, y_inp, y_tar, enc_mask = to_devices(
          (x, y_inp, y_tar, enc_mask), device)
        
        model.eval()
        y_pred = model.generate(input_ids=x, decoder_start_token_id=y_code,
            attention_mask=enc_mask, max_length=params.max_len+1,
            num_beams=params.num_beams, length_penalty=params.length_penalty,
            early_stopping=True)
        bleu(y_pred[:,1:], y_tar)

    test_results = {}
    for direction, loader in test_dataloaders.items():
        alt_direction = '-'.join(reversed(direction.split('-')))
        bleu1, bleu2 = BLEU(), BLEU()
        bleu1.set_excluded_indices([0, 2])
        bleu2.set_excluded_indices([0, 2])
        x_code = tokenizer.lang_code_to_id[LANG_CODES[direction.split('-')[0]]]
        y_code = tokenizer.lang_code_to_id[LANG_CODES[direction.split('-')[-1]]]

        start_ = time.time()
        for i, (x, y) in enumerate(loader):
            if params.test_batches is not None:
                if i > params.test_batches:
                    break

            evaluate(x, y, y_code, bleu1)
            if not params.single_direction:
                evaluate(y, x, x_code, bleu2)
            if i % params.verbose == 0:
                bl1, bl2 = bleu1.get_metric(), bleu2.get_metric()
                print('Batch {} Bleu1 {:.4f} Bleu2 {:.4f} in {:.4f} secs per batch'.format(
                    i, bl1, bl2, (time.time() - start_)/(i+1)))
                if params.wandb:
                    wandb.log({'Bleu1':bl1, 'Bleu2':bl2})

        test_results[direction] = [bleu1.get_metric()]
        test_results[alt_direction] = [bleu2.get_metric()]

    # save test_results
    pd.DataFrame(test_results).to_csv(logger.root_path + '/test_results.csv', index=False)

    if params.wandb:
        wandb.finish()
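The training loop above leans on several utilities defined elsewhere in the project (`to_devices`, `accuracy_fn`, `pad_sequence`, `WarmupDecay`, `BLEU`, `filter_languages`, ...). As an illustration, a minimal sketch of two of them, under the assumption that padding uses token id 0 (consistent with the `(x != 0)` masks above):

def to_devices(tensors, device):
    # Hypothetical helper: move a tuple of tensors to the target device.
    return tuple(t.to(device) for t in tensors)

def accuracy_fn(logits, targets, pad_id=0):
    # Hypothetical helper: fraction of non-pad target tokens predicted correctly.
    preds = logits.argmax(dim=-1)
    mask = targets != pad_id
    return (preds.eq(targets) & mask).sum().float() / mask.sum().float()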
Example #26
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    check_output_dir(training_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED),
        training_args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = MBartConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout",
                          "attention_dropout")
    for p in extra_model_params:
        if getattr(training_args, p, None):
            assert hasattr(
                config, p
            ), f"({config.__class__.__name__}) doesn't have a `{p}` attribute"
            setattr(config, p, getattr(training_args, p))

    tokenizer = MBartTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    # model = MBartForConditionalGeneration.from_pretrained(
    #     model_args.model_name_or_path,
    #     from_tf=".ckpt" in model_args.model_name_or_path,
    #     config=config,
    #     cache_dir=model_args.cache_dir,
    # )
    # model = MBartForConditionalGeneration(config)
    model = MBartForConditionalGeneration.from_pretrained(
        model_args.config_name)

    # use task specific params
    use_task_specific_params(model, data_args.task)

    # set num_beams for evaluation
    if data_args.eval_beams is None:
        data_args.eval_beams = model.config.num_beams

    # set decoder_start_token_id for MBart
    if model.config.decoder_start_token_id is None and isinstance(
            tokenizer, MBartTokenizer):
        assert (data_args.tgt_lang is not None and data_args.src_lang
                is not None), "mBart requires --tgt_lang and --src_lang"
        model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
            data_args.tgt_lang]

    if model_args.freeze_embeds:
        freeze_embeds(model)
    if model_args.freeze_encoder:
        freeze_params(model.get_encoder())
        assert_all_frozen(model.get_encoder())

    dataset_class = Seq2SeqDataset

    # Get datasets
    train_dataset = (dataset_class(
        tokenizer,
        type_path="train",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_train,
        max_target_length=data_args.max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_train else None)
    eval_dataset = (dataset_class(
        tokenizer,
        type_path="val",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_val,
        max_target_length=data_args.val_max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_eval or
                    training_args.evaluation_strategy != EvaluationStrategy.NO
                    else None)
    test_dataset = (dataset_class(
        tokenizer,
        type_path="test",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_test,
        max_target_length=data_args.test_max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_predict else None)

    # Initialize our Trainer
    compute_metrics_fn = (build_compute_metrics_fn(data_args.task, tokenizer)
                          if training_args.predict_with_generate else None)
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=Seq2SeqDataCollator(tokenizer, data_args,
                                          training_args.tpu_num_cores),
        compute_metrics=compute_metrics_fn,
        tokenizer=tokenizer,
    )

    all_metrics = {}
    # Training
    if training_args.do_train:
        logger.info("*** Train ***")

        train_result = trainer.train(
            model_path=model_args.model_name_or_path if os.path.
            isdir(model_args.model_name_or_path) else None)
        metrics = train_result.metrics
        metrics["train_n_objs"] = data_args.n_train

        trainer.save_model()  # this also saves the tokenizer

        if trainer.is_world_process_zero():
            handle_metrics("train", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

            # For convenience, we also re-save the tokenizer to the same directory,
            # so that you can share your model easily on huggingface.co/models =)
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate(metric_key_prefix="val",
                                   max_length=data_args.val_max_target_length,
                                   num_beams=data_args.eval_beams)
        metrics["val_n_objs"] = data_args.n_val
        metrics["val_loss"] = round(metrics["val_loss"], 4)

        if trainer.is_world_process_zero():

            handle_metrics("val", metrics, training_args.output_dir)
            all_metrics.update(metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")

        test_output = trainer.predict(
            test_dataset=test_dataset,
            metric_key_prefix="test",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        metrics = test_output.metrics
        metrics["test_n_objs"] = data_args.n_test

        if trainer.is_world_process_zero():
            metrics["test_loss"] = round(metrics["test_loss"], 4)
            handle_metrics("test", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            if training_args.predict_with_generate:
                test_preds = tokenizer.batch_decode(
                    test_output.predictions,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True)
                test_preds = lmap(str.strip, test_preds)
                write_txt_file(
                    test_preds,
                    os.path.join(training_args.output_dir,
                                 "test_generations.txt"))

    if trainer.is_world_process_zero():
        save_json(all_metrics,
                  os.path.join(training_args.output_dir, "all_results.json"))

    return all_metrics
Example #27
 def __init__(self, model_path: str, device: str = 'cuda') -> None:
     self.device = device
     self.model = MBartForConditionalGeneration.from_pretrained(model_path).to(device)
     self.tokenizer = MBart50Tokenizer.from_pretrained(model_path)
Example #28
 def __init__(self):
     self._model: MBartForConditionalGeneration = MBartForConditionalGeneration.from_pretrained(
         MODEL_PATH)
Example #29
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import pandas as pd

#########
# Mbart50
#########

path_to_new_dataset = '../../../03_dataset/task_01/subtask1-document/additional_training_data'

model = MBartForConditionalGeneration.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt")

translate_sentence = 'I like icecream.'

# translate English to German
tokenizer.src_lang = "en_XX"
encoded_en = tokenizer(translate_sentence, return_tensors="pt")
generated_tokens = model.generate(
    **encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id["de_DE"])
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

# ACLED (EN)
acled_en_pos = pd.read_json(f"{path_to_new_dataset}/acled_eng.json",
                            lines=True).rename(columns={
                                "notes": "text",
                                "label": "label"
                            })
acled_en_pos_select = acled_en_pos[:6928]
Example #30
    def setup(self, process_on_fly=True, n_augment=0):

        if process_on_fly:
            data = load_dataset("csv", data_files=self.file_path)["train"]
            data = data.map(
                lambda x: {"article_length": len(x["Text"].split())})
            data = data.map(
                lambda x: {"summary_length": len(x["Headline"].split())})

            data = data.map(lambda x: {
                "CleanedText":
                preprocess_article(x["cleaned"], self.sep_token)
            })

            data = data.map(lambda x: {"CleanedHeadline": x["Headline"]})
            fn_kwargs = {
                "model":
                MBartForConditionalGeneration.from_pretrained(
                    "vasudevgupta/mbart-iitb-hin-eng"),
                "tokenizer":
                MBartTokenizer.from_pretrained(
                    "vasudevgupta/mbart-iitb-hin-eng"),
                "max_pred_length":
                32,
            }

            data = data.map(translate, fn_kwargs=fn_kwargs)
            data.to_csv(f"cleaned-{self.file_path}")

        else:
            data = load_dataset(
                "csv", data_files=f"cleaned-{self.file_path}")["train"]

        data = data.filter(
            lambda x: x["article_length"] > 32 and x["summary_length"] > 1)

        removed_samples = data.filter(lambda x: type(x["CleanedHeadline"]) !=
                                      str or type(x["CleanedText"]) != str)
        print(removed_samples["CleanedHeadline"])
        print(removed_samples["CleanedText"])

        data = data.filter(lambda x: type(x["CleanedHeadline"]) == str and
                           type(x["CleanedText"]) == str)
        print("Dataset", data)

        # print("Samples with article length > 560 are", data.filter(lambda x: x["article_length"] > 560))

        data = data.train_test_split(test_size=600,
                                     shuffle=True,
                                     seed=self.seed)
        tr_dataset = data["train"].map(lambda x: {"split": "TRAIN"})
        val_dataset = data["test"].map(lambda x: {"split": "VALIDATION"})

        if n_augment > 0:
            print("AUGMENTING")
            tr_dataset = tr_dataset.map(
                lambda x: {"augmentation_status": "Not Augmented"})
            val_dataset = val_dataset.map(
                lambda x: {"augmentation_status": "Not Augmented"})
            noisy_dataset = tr_dataset.filter(
                lambda x: x["Mobile_Tech_Flag"] == 1)

            noisy_datasets = []
            for _ in range(n_augment):
                noisy_datasets.append(
                    noisy_dataset.map(lambda x: {
                        "CleanedText":
                        get_noisy_sent(x["CleanedText"].split())
                    }))
            noisy_dataset = concatenate_datasets(noisy_datasets)
            noisy_dataset = noisy_dataset.map(
                lambda x: {"augmentation_status": "Augmented"})

            tr_dataset = concatenate_datasets([noisy_dataset, tr_dataset])

        return tr_dataset, val_dataset