def save_len_file(
    tokenizer_name, data_dir, max_source_length=1024, max_target_length=1024, consider_target=False, **kwargs
):
    """Save each example's token length (max(src_len, tgt_len) when consider_target=True, else src_len)
    to allow dynamic batching."""
    tok = AutoTokenizer.from_pretrained(tokenizer_name)
    train_ds = Seq2SeqDataset(tok, data_dir, max_source_length, max_target_length, type_path="train", **kwargs)
    pad = tok.pad_token_id

    def get_lens(ds):
        dl = tqdm(
            DataLoader(ds, batch_size=512, num_workers=8, shuffle=False, collate_fn=ds.collate_fn),
            desc=str(ds.len_file),
        )
        max_lens = []
        for batch in dl:
            src_lens = batch["input_ids"].ne(pad).sum(1).tolist()
            tgt_lens = batch["labels"].ne(pad).sum(1).tolist()
            if consider_target:
                for src, tgt in zip(src_lens, tgt_lens):
                    max_lens.append(max(src, tgt))
            else:
                max_lens.extend(src_lens)
        return max_lens

    train_lens = get_lens(train_ds)
    val_ds = Seq2SeqDataset(tok, data_dir, max_source_length, max_target_length, type_path="val", **kwargs)
    val_lens = get_lens(val_ds)
    pickle_save(train_lens, train_ds.len_file)
    pickle_save(val_lens, val_ds.len_file)
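# Usage sketch (assumption: MARIAN_TINY and the data_dir below are the same placeholders
# used by _get_dataset further down, not required values). The call writes pickled length
# lists to train_ds.len_file / val_ds.len_file (e.g. "<data_dir>/train.len"), which the
# batch samplers can read back for dynamic batching.
save_len_file(
    MARIAN_TINY,                              # tokenizer checkpoint (placeholder)
    "examples/seq2seq/test_data/wmt_en_ro",   # data_dir with train/val .source/.target files (assumed layout)
    consider_target=True,                     # store max(src_len, tgt_len) instead of src_len only
)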
def test_dataset_kwargs(self, tok_name):
    tokenizer = AutoTokenizer.from_pretrained(tok_name, use_fast=False)
    if tok_name == MBART_TINY:
        train_dataset = Seq2SeqDataset(
            tokenizer,
            data_dir=make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir()),
            type_path="train",
            max_source_length=4,
            max_target_length=8,
            src_lang="EN",
            tgt_lang="FR",
        )
        kwargs = train_dataset.dataset_kwargs
        assert "src_lang" in kwargs and "tgt_lang" in kwargs
    else:
        train_dataset = Seq2SeqDataset(
            tokenizer,
            data_dir=make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir()),
            type_path="train",
            max_source_length=4,
            max_target_length=8,
        )
        kwargs = train_dataset.dataset_kwargs
        # Only BART needs (and therefore gets) the add_prefix_space kwarg.
        assert "add_prefix_space" not in kwargs if tok_name != BART_TINY else "add_prefix_space" in kwargs
        assert len(kwargs) == 1 if tok_name == BART_TINY else len(kwargs) == 0
def get_dataset(self, type_path) -> Seq2SeqDataset:
    n_obs = self.n_obs[type_path]
    max_target_length = self.target_lens[type_path]
    dataset = Seq2SeqDataset(
        self.tokenizer,
        type_path=type_path,
        n_obs=n_obs,
        max_target_length=max_target_length,
        **self.dataset_kwargs,
    )
    return dataset
def test_seq2seq_dataset_truncation(self, tok_name):
    tokenizer = AutoTokenizer.from_pretrained(tok_name)
    tmp_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
    max_len_source = max(len(tokenizer.encode(a)) for a in ARTICLES)
    max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
    max_src_len = 4
    max_tgt_len = 8
    assert max_len_target > max_src_len  # Will be truncated
    assert max_len_source > max_src_len  # Will be truncated
    src_lang, tgt_lang = "ro_RO", "de_DE"  # ignored for all but mbart, but never causes an error.
    train_dataset = Seq2SeqDataset(
        tokenizer,
        data_dir=tmp_dir,
        type_path="train",
        max_source_length=max_src_len,
        max_target_length=max_tgt_len,  # ignored
        src_lang=src_lang,
        tgt_lang=tgt_lang,
    )
    dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=train_dataset.collate_fn)
    for batch in dataloader:
        assert isinstance(batch, dict)
        assert batch["attention_mask"].shape == batch["input_ids"].shape
        # show that articles were trimmed.
        assert batch["input_ids"].shape[1] == max_src_len
        # show that targets are the same len
        assert batch["labels"].shape[1] == max_tgt_len
        if tok_name != MBART_TINY:
            continue

        # check language codes in correct place
        batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], tokenizer.pad_token_id)
        assert batch["decoder_input_ids"][0, 0].item() == tokenizer.lang_code_to_id[tgt_lang]
        assert batch["decoder_input_ids"][0, -1].item() == tokenizer.eos_token_id
        assert batch["input_ids"][0, -2].item() == tokenizer.eos_token_id
        assert batch["input_ids"][0, -1].item() == tokenizer.lang_code_to_id[src_lang]
        break  # No need to test every batch
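# The mbart assertions above rely on shift_tokens_right wrapping the last non-pad
# token (here the target language code) around to position 0. A toy re-implementation
# for illustration only: the real helper lives in the modeling code, and this sketch
# assumes the legacy (input_ids, pad_token_id) signature. Token ids are made up.
import torch

def _shift_tokens_right_demo(input_ids: torch.Tensor, pad_token_id: int) -> torch.Tensor:
    prev_output_tokens = input_ids.clone()
    # index of the last non-pad token in each row
    index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
    prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze(-1)
    prev_output_tokens[:, 1:] = input_ids[:, :-1]
    return prev_output_tokens

# labels laid out as [target tokens ..., EOS, tgt_lang_code]
labels = torch.tensor([[51, 52, 53, 2, 250004]])         # 2 = EOS, 250004 = lang code (illustrative ids)
print(_shift_tokens_right_demo(labels, pad_token_id=1))  # tensor([[250004, 51, 52, 53, 2]])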
def _get_dataset(self, n_obs=1000, max_len=128):
    if os.getenv("USE_REAL_DATA", False):
        data_dir = "examples/seq2seq/wmt_en_ro"
        max_tokens = max_len * 2 * 64
        if not Path(data_dir).joinpath("train.len").exists():
            save_len_file(MARIAN_TINY, data_dir)
    else:
        data_dir = "examples/seq2seq/test_data/wmt_en_ro"
        max_tokens = max_len * 4
        save_len_file(MARIAN_TINY, data_dir)

    tokenizer = AutoTokenizer.from_pretrained(MARIAN_TINY)
    ds = Seq2SeqDataset(
        tokenizer,
        data_dir=data_dir,
        type_path="train",
        max_source_length=max_len,
        max_target_length=max_len,
        n_obs=n_obs,
    )
    return ds, max_tokens, tokenizer
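# A sketch of a companion test that consumes the (ds, max_tokens, tokenizer) tuple.
# Assumptions: this lives on the same test class as _get_dataset, and Seq2SeqDataset
# exposes make_dynamic_sampler(max_tokens_per_batch) built on the *.len files written
# by save_len_file; if it does not, the make_sortish_sampler call used in eval_data_dir
# below is the fallback.
def test_dynamic_batching_sketch(self):
    ds, max_tokens, tokenizer = self._get_dataset(n_obs=64, max_len=64)
    batch_sampler = ds.make_dynamic_sampler(max_tokens)  # assumed helper, see note above
    loader = DataLoader(ds, batch_sampler=batch_sampler, collate_fn=ds.collate_fn, num_workers=2)
    for batch in loader:
        # each padded batch should stay close to the max_tokens source-token budget
        print(batch["input_ids"].shape, batch["input_ids"].numel(), max_tokens)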
def eval_data_dir(
    data_dir,
    save_dir: str,
    model_name: str,
    bs: int = 8,
    max_source_length: int = 1024,
    type_path="val",
    n_obs=None,
    fp16=False,
    task="summarization",
    local_rank=None,
    **generate_kwargs,
) -> Dict:
    """Run evaluation on part of the data for one gpu and save to {save_dir}/rank_{rank}_output.json"""
    model_name = str(model_name)
    assert local_rank is not None
    torch.distributed.init_process_group(backend="nccl", rank=local_rank)

    save_dir = Path(save_dir)
    save_path = save_dir.joinpath(f"rank_{local_rank}_output.json")
    torch.cuda.set_device(local_rank)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).cuda()
    if fp16:
        model = model.half()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    logger.info(f"Inferred tokenizer type: {tokenizer.__class__}")  # if this is wrong, check config.model_type.

    use_task_specific_params(model, task)  # update config with task specific params
    if max_source_length is None:
        max_source_length = tokenizer.model_max_length
    ds = Seq2SeqDataset(
        tokenizer,
        data_dir,
        max_source_length,
        max_target_length=1024,
        type_path=type_path,
        n_obs=n_obs,
        prefix=model.config.prefix,
    )
    # I set shuffle=True for a more accurate progress bar.
    # If all the longest samples are first, the prog bar estimate is too high at the beginning.
    sampler = ds.make_sortish_sampler(bs, distributed=True, add_extra_examples=False, shuffle=True)
    data_loader = DataLoader(ds, sampler=sampler, batch_size=bs, collate_fn=ds.collate_fn)
    results = []
    for batch in tqdm(data_loader):
        summaries = model.generate(
            input_ids=batch["input_ids"].to(model.device),
            attention_mask=batch["attention_mask"].to(model.device),
            **generate_kwargs,
        )
        preds = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        ids = batch["ids"]
        for i, pred in enumerate(preds):
            results.append(dict(pred=pred, id=ids[i].item()))
    save_json(results, save_path)
    return results, sampler.num_replicas
def eval_data_dir(
    data_dir,
    save_dir: str,
    model_name: str,
    bs: int = 8,
    max_source_length: int = 1024,
    type_path="val",
    n_obs=None,
    fp16=False,
    save_source=False,
    num_beams: int = 4,
    task="summarization",
    local_rank=None,
    **generate_kwargs,
) -> Dict:
    """Run evaluation on part of the data for one gpu and save to {save_dir}/rank_{rank}_output.json"""
    model_name = str(model_name)
    assert local_rank is not None
    torch.distributed.init_process_group(backend="nccl", rank=local_rank)

    save_dir = Path(save_dir)
    save_path = save_dir.joinpath(f"rank_{local_rank}_output.json")
    torch.cuda.set_device(local_rank)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).cuda()
    if fp16:
        model = model.half()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    logger.info(f"Inferred tokenizer type: {tokenizer.__class__}")  # if this is wrong, check config.model_type.

    use_task_specific_params(model, task)  # update config with task specific params
    if max_source_length is None:
        max_source_length = tokenizer.model_max_length
    ds = Seq2SeqDataset(
        tokenizer,
        data_dir,
        max_source_length,
        max_target_length=1024,
        type_path=type_path,
        n_obs=n_obs,
        prefix=model.config.prefix,
    )
    sampler = ds.make_sortish_sampler(bs, distributed=True)
    data_loader = DataLoader(ds, sampler=sampler, batch_size=bs, collate_fn=ds.collate_fn)
    dec_kwargs = dict(skip_special_tokens=True, clean_up_tokenization_spaces=False)  # tokenizer.decode
    results = []
    for batch in tqdm(data_loader):
        summaries = model.generate(
            input_ids=batch["input_ids"].to(model.device),
            attention_mask=batch["attention_mask"].to(model.device),
            num_beams=num_beams,
            **generate_kwargs,
        )
        preds = tokenizer.batch_decode(summaries, **dec_kwargs)
        labels = tokenizer.batch_decode(batch["labels"], **dec_kwargs)
        if save_source:
            docs = tokenizer.batch_decode(batch["input_ids"], **dec_kwargs)
        for i in range(len(labels)):
            label, pred = labels[i], preds[i]
            if save_source:
                results.append(dict(pred=pred, label=label, source=docs[i]))
            else:
                results.append(dict(pred=pred, label=label))
    save_json(results, save_path)
    return results
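# eval_data_dir expects to be called once per process with its local_rank already known.
# A hedged launcher sketch: the paths, checkpoint, and env-var handling below are
# illustrative placeholders, not the real run_distributed_eval.py CLI.
import os

def run_generate_sketch():
    # With torch.distributed.launch / torchrun, each worker gets LOCAL_RANK in its environment.
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    os.makedirs("eval_output", exist_ok=True)
    eval_data_dir(
        data_dir="examples/seq2seq/test_data/wmt_en_ro",  # placeholder path
        save_dir="eval_output",                           # placeholder output dir
        model_name=MARIAN_TINY,                           # placeholder checkpoint
        bs=8,
        type_path="val",
        local_rank=local_rank,
        num_beams=2,
    )
    # Each rank writes eval_output/rank_{local_rank}_output.json; rank 0 can then glob
    # those files and merge/score the predictions.

if __name__ == "__main__":
    run_generate_sketch()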
def main():
    parser = MyArgumentParser((InferenceArguments,))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        (args,) = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        (args,) = parser.parse_args_into_dataclasses()

    params = dict(
        pretrained_model_name_or_path=args.model_name_or_path,
        cache_dir=args.cache_dir,
    )
    config = AutoConfig.from_pretrained(**params)
    tokenizer = AutoTokenizer.from_pretrained(**params)
    model = AutoModelForSeq2SeqLM.from_pretrained(config=config, **params)

    if args.model_parameters:
        print("====== MODEL PARAMETER LOADING... ======\n" f"  {args.model_parameters}")
        model.load_state_dict(torch.load(args.model_parameters))

    max_length = args.test_max_target_length
    # set num_beams for evaluation
    num_beams = args.num_beams if args.num_beams else model.config.num_beams

    test_dataset = Seq2SeqDataset(
        tokenizer=tokenizer,
        type_path="test",
        data_dir=args.data_dir,
        max_target_length=args.test_max_target_length,
        max_source_length=args.max_source_length,
    )
    test_sampler = SequentialSampler(test_dataset)
    data_collator = Seq2SeqDataCollator(tokenizer, args)
    test_dataloader = DataLoader(
        test_dataset,
        sampler=test_sampler,
        batch_size=args.per_device_test_batch_size,
        collate_fn=data_collator,
        drop_last=False,
    )

    # prediction_loop
    description = "Prediction"
    batch_size = test_dataloader.batch_size
    num_examples = len(test_dataloader.dataset)
    print(f"***** Running {description} *****")
    print(f"  Num examples = {num_examples}")
    print(f"  Batch size = {batch_size}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    res = []
    for step, inputs in enumerate(test_dataloader):
        # prediction_step, generative based
        has_labels = "labels" in inputs  # False

        # _prepare_inputs:
        # 1. move tensors to the device
        # 2. (the Trainer would also set up `_past` in memory here)
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.to(device)

        gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
        generated_tokens = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            **gen_kwargs,
        )
        # in case the batch is shorter than max length, the output should be padded
        if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
            # If the PAD token is not defined, at least the EOS token has to be defined
            padded_tensor = tokenizer.pad_token_id * torch.ones(
                (generated_tokens.shape[0], gen_kwargs["max_length"]),
                dtype=generated_tokens.dtype,
                device=generated_tokens.device,
            )
            padded_tensor[:, : generated_tokens.shape[-1]] = generated_tokens
            generated_tokens = padded_tensor

        # loss/labels are not used in this generate-only loop
        loss = None
        labels = None
        res.extend(list(generated_tokens))

    submit(args, tokenizer, res)
    print("Finished!")
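# Illustrative config for the json-file branch of main(): the field names mirror the
# InferenceArguments attributes read above, but every value is a placeholder, and the
# script name in the final comment is a guess.
import json

inference_args = {
    "model_name_or_path": "facebook/mbart-large-cc25",  # placeholder checkpoint
    "cache_dir": None,
    "model_parameters": None,           # optional path to a fine-tuned state_dict
    "data_dir": "data",                 # placeholder; presumably contains test.source/test.target
    "max_source_length": 512,
    "test_max_target_length": 128,
    "num_beams": 4,
    "per_device_test_batch_size": 16,
}
with open("inference_args.json", "w") as f:
    json.dump(inference_args, f, indent=2)
# then e.g.: python inference.py inference_args.json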