Example #1
def save_len_file(
    tokenizer_name, data_dir, max_source_length=1024, max_target_length=1024, consider_target=False, **kwargs
):
    """Save max(src_len, tgt_len) for each example to allow dynamic batching."""
    tok = AutoTokenizer.from_pretrained(tokenizer_name)
    train_ds = Seq2SeqDataset(tok, data_dir, max_source_length, max_target_length, type_path="train", **kwargs)
    pad = tok.pad_token_id

    def get_lens(ds):
        dl = tqdm(
            DataLoader(ds, batch_size=512, num_workers=8, shuffle=False, collate_fn=ds.collate_fn),
            desc=str(ds.len_file),
        )
        max_lens = []
        for batch in dl:
            src_lens = batch["input_ids"].ne(pad).sum(1).tolist()
            tgt_lens = batch["labels"].ne(pad).sum(1).tolist()
            if consider_target:
                for src, tgt in zip(src_lens, tgt_lens):
                    max_lens.append(max(src, tgt))
            else:
                max_lens.extend(src_lens)
        return max_lens

    train_lens = get_lens(train_ds)
    val_ds = Seq2SeqDataset(tok, data_dir, max_source_length, max_target_length, type_path="val", **kwargs)
    val_lens = get_lens(val_ds)
    pickle_save(train_lens, train_ds.len_file)
    pickle_save(val_lens, val_ds.len_file)
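A minimal usage sketch for save_len_file, assuming the data directory follows the examples/seq2seq layout ({type_path}.source / {type_path}.target files); the tokenizer name and path below are illustrative placeholders borrowed from the other examples on this page.

# Hypothetical call: writes the pickled length files referenced by
# train_ds.len_file / val_ds.len_file so a length-aware sampler can reuse them.
save_len_file(
    "sshleifer/tiny-mbart",                   # any name AutoTokenizer.from_pretrained accepts
    "examples/seq2seq/test_data/wmt_en_ro",   # assumed data_dir layout
    consider_target=True,                     # store max(src_len, tgt_len) per example
)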
Example #2
    def test_dataset_kwargs(self, tok_name):
        tokenizer = AutoTokenizer.from_pretrained(tok_name, use_fast=False)
        if tok_name == MBART_TINY:
            train_dataset = Seq2SeqDataset(
                tokenizer,
                data_dir=make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir()),
                type_path="train",
                max_source_length=4,
                max_target_length=8,
                src_lang="EN",
                tgt_lang="FR",
            )
            kwargs = train_dataset.dataset_kwargs
            assert "src_lang" in kwargs and "tgt_lang" in kwargs
        else:
            train_dataset = Seq2SeqDataset(
                tokenizer,
                data_dir=make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir()),
                type_path="train",
                max_source_length=4,
                max_target_length=8,
            )
            kwargs = train_dataset.dataset_kwargs
            assert "add_prefix_space" not in kwargs if tok_name != BART_TINY else "add_prefix_space" in kwargs
            assert len(kwargs) == 1 if tok_name == BART_TINY else len(kwargs) == 0
Example #3
    def get_dataset(self, type_path) -> Seq2SeqDataset:
        n_obs = self.n_obs[type_path]
        max_target_length = self.target_lens[type_path]
        dataset = Seq2SeqDataset(
            self.tokenizer,
            type_path=type_path,
            n_obs=n_obs,
            max_target_length=max_target_length,
            **self.dataset_kwargs,
        )
        return dataset
Example #4
    def test_seq2seq_dataset_truncation(self, tok_name):
        tokenizer = AutoTokenizer.from_pretrained(tok_name)
        tmp_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
        max_len_source = max(len(tokenizer.encode(a)) for a in ARTICLES)
        max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
        max_src_len = 4
        max_tgt_len = 8
        assert max_len_target > max_src_len  # Will be truncated
        assert max_len_source > max_src_len  # Will be truncated
        src_lang, tgt_lang = "ro_RO", "de_DE"  # ignored for all but mbart, but never causes error.
        train_dataset = Seq2SeqDataset(
            tokenizer,
            data_dir=tmp_dir,
            type_path="train",
            max_source_length=max_src_len,
            max_target_length=max_tgt_len,  # ignored
            src_lang=src_lang,
            tgt_lang=tgt_lang,
        )
        dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=train_dataset.collate_fn)
        for batch in dataloader:
            assert isinstance(batch, dict)
            assert batch["attention_mask"].shape == batch["input_ids"].shape
            # show that articles were trimmed.
            assert batch["input_ids"].shape[1] == max_src_len
            # show that targets are the same len
            assert batch["labels"].shape[1] == max_tgt_len
            if tok_name != MBART_TINY:
                continue
            # check language codes in correct place
            batch["decoder_input_ids"] = shift_tokens_right(
                batch["labels"], tokenizer.pad_token_id)
            assert batch["decoder_input_ids"][
                0, 0].item() == tokenizer.lang_code_to_id[tgt_lang]
            assert batch["decoder_input_ids"][
                0, -1].item() == tokenizer.eos_token_id
            assert batch["input_ids"][0, -2].item() == tokenizer.eos_token_id
            assert batch["input_ids"][
                0, -1].item() == tokenizer.lang_code_to_id[src_lang]

            break  # No need to test every batch
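For reference, the token layout that the mBART-specific assertions above encode, spelled out as comments; the labels line is inferred from shift_tokens_right rotating the last non-padding token to position 0, so treat it as an assumption rather than something the test checks directly.

# input_ids (mBART)   : [source tokens ..., eos, src_lang_code]
# labels (assumed)    : [target tokens ..., eos, tgt_lang_code]
# decoder_input_ids   : [tgt_lang_code, target tokens ..., eos]   (labels shifted right)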
Example #5
    def _get_dataset(self, n_obs=1000, max_len=128):
        if os.getenv("USE_REAL_DATA", False):
            data_dir = "examples/seq2seq/wmt_en_ro"
            max_tokens = max_len * 2 * 64
            if not Path(data_dir).joinpath("train.len").exists():
                save_len_file(MARIAN_TINY, data_dir)
        else:
            data_dir = "examples/seq2seq/test_data/wmt_en_ro"
            max_tokens = max_len * 4
            save_len_file(MARIAN_TINY, data_dir)

        tokenizer = AutoTokenizer.from_pretrained(MARIAN_TINY)
        ds = Seq2SeqDataset(
            tokenizer,
            data_dir=data_dir,
            type_path="train",
            max_source_length=max_len,
            max_target_length=max_len,
            n_obs=n_obs,
        )
        return ds, max_tokens, tokenizer
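A hedged sketch of a companion test method that consumes the helper above; the method name and assertion are illustrative only, and a plain DataLoader stands in for whatever max_tokens-aware sampler the real tests exercise.

    def test_get_dataset_truncates(self):
        # Hypothetical companion test, living in the same class as _get_dataset above.
        ds, max_tokens, tokenizer = self._get_dataset(n_obs=64, max_len=64)
        loader = DataLoader(ds, batch_size=2, collate_fn=ds.collate_fn)
        batch = next(iter(loader))
        assert batch["input_ids"].shape[1] <= 64  # sources were truncated to max_len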
Example #6
def eval_data_dir(
    data_dir,
    save_dir: str,
    model_name: str,
    bs: int = 8,
    max_source_length: int = 1024,
    type_path="val",
    n_obs=None,
    fp16=False,
    task="summarization",
    local_rank=None,
    **generate_kwargs,
) -> Tuple[List[Dict], int]:
    """Run evaluation on part of the data for one gpu and save to {save_dir}/rank_{rank}_output.json"""
    model_name = str(model_name)
    assert local_rank is not None
    torch.distributed.init_process_group(backend="nccl", rank=local_rank)

    save_dir = Path(save_dir)
    save_path = save_dir.joinpath(f"rank_{local_rank}_output.json")
    torch.cuda.set_device(local_rank)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).cuda()
    if fp16:
        model = model.half()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    logger.info(f"Inferred tokenizer type: {tokenizer.__class__}"
                )  # if this is wrong, check config.model_type.
    use_task_specific_params(model,
                             task)  # update config with task specific params
    if max_source_length is None:
        max_source_length = tokenizer.model_max_length
    ds = Seq2SeqDataset(
        tokenizer,
        data_dir,
        max_source_length,
        max_target_length=1024,
        type_path=type_path,
        n_obs=n_obs,
        prefix=model.config.prefix,
    )
    # I set shuffle=True for a more accurate progress bar.
    # If all the longest samples are first, the prog bar estimate is too high at the beginning.
    sampler = ds.make_sortish_sampler(bs, distributed=True, add_extra_examples=False, shuffle=True)
    data_loader = DataLoader(ds, sampler=sampler, batch_size=bs, collate_fn=ds.collate_fn)
    results = []
    for batch in tqdm(data_loader):
        summaries = model.generate(
            input_ids=batch["input_ids"].to(model.device),
            attention_mask=batch["attention_mask"].to(model.device),
            **generate_kwargs,
        )
        preds = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        ids = batch["ids"]
        for i, pred in enumerate(preds):
            results.append(dict(pred=pred, id=ids[i].item()))
    save_json(results, save_path)
    return results, sampler.num_replicas
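A minimal per-process launch sketch for the eval_data_dir above; it assumes the usual torch.distributed launcher environment (LOCAL_RANK, MASTER_ADDR, and so on) is already set, and the model and data names are placeholders.

import os

local_rank = int(os.environ.get("LOCAL_RANK", 0))    # assumed to be provided by the launcher
results, num_replicas = eval_data_dir(
    data_dir="examples/seq2seq/wmt_en_ro",           # placeholder dataset directory
    save_dir="eval_output",                          # each rank writes rank_{rank}_output.json here
    model_name="Helsinki-NLP/opus-mt-en-ro",         # any seq2seq checkpoint
    bs=8,
    type_path="val",
    fp16=True,
    local_rank=local_rank,
    num_beams=4,                                     # forwarded to model.generate via **generate_kwargs
)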
Example #7
def eval_data_dir(
    data_dir,
    save_dir: str,
    model_name: str,
    bs: int = 8,
    max_source_length: int = 1024,
    type_path="val",
    n_obs=None,
    fp16=False,
    save_source=False,
    num_beams: int = 4,
    task="summarization",
    local_rank=None,
    **generate_kwargs,
) -> List[Dict]:
    """Run evaluation on part of the data for one gpu and save to {save_dir}/rank_{rank}_output.json"""
    model_name = str(model_name)
    assert local_rank is not None
    torch.distributed.init_process_group(backend="nccl", rank=local_rank)

    save_dir = Path(save_dir)
    save_path = save_dir.joinpath(f"rank_{local_rank}_output.json")
    torch.cuda.set_device(local_rank)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).cuda()
    if fp16:
        model = model.half()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    logger.info(f"Inferred tokenizer type: {tokenizer.__class__}")  # if this is wrong, check config.model_type.
    use_task_specific_params(model, task)  # update config with task specific params
    if max_source_length is None:
        max_source_length = tokenizer.model_max_length
    ds = Seq2SeqDataset(
        tokenizer,
        data_dir,
        max_source_length,
        max_target_length=1024,
        type_path=type_path,
        n_obs=n_obs,
        prefix=model.config.prefix,
    )
    sampler = ds.make_sortish_sampler(bs, distributed=True)
    data_loader = DataLoader(ds, sampler=sampler, batch_size=bs, collate_fn=ds.collate_fn)
    dec_kwargs = dict(skip_special_tokens=True, clean_up_tokenization_spaces=False)  # tokenizer.decode
    results = []
    for batch in tqdm(data_loader):
        summaries = model.generate(
            input_ids=batch["input_ids"].to(model.device),
            attention_mask=batch["attention_mask"].to(model.device),
            num_beams=num_beams,
            **generate_kwargs,
        )
        preds = tokenizer.batch_decode(summaries, **dec_kwargs)
        labels = tokenizer.batch_decode(batch["labels"], **dec_kwargs)
        if save_source:
            docs = tokenizer.batch_decode(batch["input_ids"], **dec_kwargs)
        for i in range(len(labels)):
            label, pred = labels[i], preds[i]
            if save_source:
                results.append(dict(pred=pred, label=label, source=docs[i]))
            else:
                results.append(dict(pred=pred, label=label))
    save_json(results, save_path)
    return results
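A small follow-up sketch for gathering the per-rank files this variant writes; it uses only the standard library because the counterpart of the save_json helper is not shown in this excerpt, and the output directory name is a placeholder.

import json
from pathlib import Path

# Hypothetical aggregation on rank 0 after every worker has finished.
merged = []
for rank_file in sorted(Path("eval_output").glob("rank_*_output.json")):
    merged.extend(json.loads(rank_file.read_text()))
# merged is now one list of {"pred": ..., "label": ...} records across all ranks.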
Example #8
def main():
    parser = MyArgumentParser((InferenceArguments, ))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        (args, ) = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        (args, ) = parser.parse_args_into_dataclasses()

    params = dict(
        pretrained_model_name_or_path=args.model_name_or_path,
        cache_dir=args.cache_dir,
    )

    config = AutoConfig.from_pretrained(**params)
    tokenizer = AutoTokenizer.from_pretrained(**params)
    model = AutoModelForSeq2SeqLM.from_pretrained(config=config, **params)

    if args.model_parameters:
        print("====== MODEL PARAMETER LOADING... ======\n"
              f"   {args.model_parameters}")
        model.load_state_dict(torch.load(args.model_parameters))

    max_length = args.test_max_target_length

    # set num_beams for evaluation
    num_beams = args.num_beams if args.num_beams else model.config.num_beams

    test_dataset = Seq2SeqDataset(
        tokenizer=tokenizer,
        type_path='test',
        data_dir=args.data_dir,
        max_target_length=args.test_max_target_length,
        max_source_length=args.max_source_length,
    )

    test_sampler = SequentialSampler(test_dataset)

    data_collator = Seq2SeqDataCollator(tokenizer, args)

    test_dataloader = DataLoader(
        test_dataset,
        sampler=test_sampler,
        batch_size=args.per_device_test_batch_size,
        collate_fn=data_collator,
        drop_last=False,
    )

    # prediction_loop
    description = "Prediction"

    batch_size = test_dataloader.batch_size
    num_examples = len(test_dataloader.dataset)

    print(f"***** Running {description} *****")
    print(f"  Num examples = {num_examples}")
    print(f"  Batch size = {batch_size}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    res = []
    for step, inputs in enumerate(test_dataloader):
        # prediction_step, generative based
        has_labels = "labels" in inputs  # False
        # _prepare_inputs
        #  1. move tensors to the device
        #  2. put _past into memory
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.to(device)
        gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
        generated_tokens = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            **gen_kwargs,
        )
        # in case the batch is shorter than max length, the output should be padded
        if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
            # If PAD token is not defined at least EOS token has to be defined
            padded_tensor = tokenizer.pad_token_id * torch.ones(
                (generated_tokens.shape[0], gen_kwargs["max_length"]),
                dtype=generated_tokens.dtype,
                device=generated_tokens.device,
            )
            padded_tensor[:, :generated_tokens.shape[-1]] = generated_tokens
            generated_tokens = padded_tensor
        loss = None  # unused: this loop only generates, it does not compute a loss
        labels = None
        res.extend(list(generated_tokens))
    submit(args, tokenizer, res)
    print("Finished!")