def __init__(self, hparams, **kwargs):
    """Initialise the module and derive per-split dataset settings.

    After delegating to the parent constructor this records the metric and
    hparams save paths, the dataset keyword arguments, per-split observation
    limits and target lengths, and optionally freezes embeddings / encoder.

    Args:
        hparams: hyper-parameter namespace. Attributes read here:
            ``output_dir``, ``data_dir``, ``max_source_length``, ``n_train``,
            ``n_val``, ``n_test``, ``max_target_length``,
            ``val_max_target_length``, ``test_max_target_length``,
            ``freeze_embeds``, ``freeze_encoder`` and ``gpus``.
        **kwargs: forwarded unchanged to the parent constructor.

    Raises:
        AssertionError: if the train target length is larger than the val or
            test target length.
    """
    super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
    use_task_specific_params(self.model, "summarization")
    save_git_info(self.hparams.output_dir)

    # NOTE(review): self.output_dir is presumably set by the parent class.
    self.metrics_save_path = Path(self.output_dir) / "metrics.pkl"
    self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
    self.step_count = 0
    self.metrics = {"train": [], "val": [], "test": []}

    self.dataset_kwargs: dict = {
        "data_dir": self.hparams.data_dir,
        "max_source_length": self.hparams.max_source_length,
        "prefix": self.model.config.prefix or "",
    }

    # A negative limit means "no limit" and is stored as None.
    requested_counts = {
        "train": self.hparams.n_train,
        "val": self.hparams.n_val,
        "test": self.hparams.n_test,
    }
    limits = {}
    for split, count in requested_counts.items():
        limits[split] = count if count >= 0 else None
    self.n_obs = limits

    self.target_lens = {
        "train": self.hparams.max_target_length,
        "val": self.hparams.val_max_target_length,
        "test": self.hparams.test_max_target_length,
    }
    for split in ("val", "test"):
        assert self.target_lens["train"] <= self.target_lens[split], f"target_lens: {self.target_lens}"

    if self.hparams.freeze_embeds:
        self.freeze_embeds()
    if self.hparams.freeze_encoder:
        freeze_params(self.model.model.encoder)  # TODO: this will break for t5

    self.hparams.git_sha = get_git_info()["repo_sha"]

    # passing num_workers breaks lightning for multigpu
    if self.hparams.gpus <= 1:
        self.num_workers = 4
    else:
        self.num_workers = None
def __init__(self, hparams, **kwargs):
    """Initialise the module and derive dataset / evaluation settings.

    Args:
        hparams: hyper-parameter namespace. Besides the dataset related
            attributes (``data_dir``, ``max_source_length``, ``n_*``,
            ``*max_target_length``), this reads ``sortish_sampler``, ``gpus``,
            ``num_workers``, ``freeze_embeds``, ``freeze_encoder``,
            ``tgt_lang``, ``eval_beams``, ``eval_max_gen_length`` and
            ``val_metric``.
        **kwargs: forwarded unchanged to the parent constructor.

    Raises:
        AssertionError: if the train target length exceeds the val/test one,
            or if the resolved number of evaluation beams is below 1.
    """
    # With a sortish sampler on several GPUs the sampler must not be replaced.
    if hparams.sortish_sampler and hparams.gpus > 1:
        hparams.replace_sampler_ddp = False
    super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
    use_task_specific_params(self.model, "summarization")
    save_git_info(self.hparams.output_dir)
    self.metrics_save_path = Path(self.output_dir) / "metrics.json"
    self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
    pickle_save(self.hparams, self.hparams_save_path)
    self.step_count = 0
    self.metrics = defaultdict(list)

    self.dataset_kwargs: dict = dict(
        data_dir=self.hparams.data_dir,
        max_source_length=self.hparams.max_source_length,
        prefix=self.model.config.prefix or "",
    )
    n_observations_per_split = {
        "train": self.hparams.n_train,
        "val": self.hparams.n_val,
        "test": self.hparams.n_test,
    }
    # A negative count means "use everything" and is stored as None.
    self.n_obs = {k: v if v >= 0 else None for k, v in n_observations_per_split.items()}

    self.target_lens = {
        "train": self.hparams.max_target_length,
        "val": self.hparams.val_max_target_length,
        "test": self.hparams.test_max_target_length,
    }
    assert self.target_lens["train"] <= self.target_lens["val"], f"target_lens: {self.target_lens}"
    assert self.target_lens["train"] <= self.target_lens["test"], f"target_lens: {self.target_lens}"

    if self.hparams.freeze_embeds:
        self.freeze_embeds()
    if self.hparams.freeze_encoder:
        freeze_params(self.model.get_encoder())
        assert_all_frozen(self.model.get_encoder())

    self.hparams.git_sha = get_git_info()["repo_sha"]
    self.num_workers = hparams.num_workers

    self.decoder_start_token_id = None  # default to config
    if self.model.config.decoder_start_token_id is None and isinstance(self.tokenizer, MBartTokenizer):
        self.decoder_start_token_id = self.tokenizer.lang_code_to_id[hparams.tgt_lang]
        self.model.config.decoder_start_token_id = self.decoder_start_token_id

    # Tokenizers without prepare_seq2seq_batch fall back to the legacy dataset.
    self.dataset_class = (
        Seq2SeqDataset if hasattr(self.tokenizer, "prepare_seq2seq_batch") else LegacySeq2SeqDataset
    )

    self.eval_beams = self.model.config.num_beams if self.hparams.eval_beams is None else self.hparams.eval_beams
    # The message now matches the check: a single beam is accepted.
    assert self.eval_beams >= 1, f"got self.eval_beams={self.eval_beams}. Need an integer >= 1"

    if self.hparams.eval_max_gen_length is not None:
        self.eval_max_length = self.hparams.eval_max_gen_length
    else:
        self.eval_max_length = self.model.config.max_length
    self.val_metric = self.default_val_metric if self.hparams.val_metric is None else self.hparams.val_metric
def generate_summaries_or_translations(
    examples: list,
    out_file: str,
    model_name: str,
    batch_size: int = 8,
    device: str = DEFAULT_DEVICE,
    fp16=False,
    **gen_kwargs,
) -> None:
    """Generate one output line per example and write them to ``out_file``.

    Args:
        examples: input texts, processed in chunks of ``batch_size``.
        out_file: path of the text file to (over)write, one hypothesis per line.
        model_name: name or path passed to ``from_pretrained``.
        batch_size: number of examples per generation call.
        device: device the model and encoded batches are moved to.
        fp16: when true, the model is converted with ``.half()``.
        **gen_kwargs: forwarded to ``model.generate``.
    """
    # The file is opened first (as before) but is now closed on every exit
    # path, including exceptions raised while loading or generating.
    with Path(out_file).open("w", encoding="utf-8") as fout:
        model_name = str(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
        if fp16:
            model = model.half()
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # update config with summarization specific params
        use_task_specific_params(model, "summarization")

        for batch in tqdm(list(chunks(examples, batch_size))):
            if "t5" in model_name:
                batch = [model.config.prefix + text for text in batch]
            batch = tokenizer.batch_encode_plus(
                batch,
                max_length=1024,
                return_tensors="pt",
                truncation=True,
                pad_to_max_length=True,
            ).to(device)
            summaries = model.generate(**batch, **gen_kwargs)
            dec = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for hypothesis in dec:
                fout.write(hypothesis + "\n")
                fout.flush()
def __init__(self, model_path):
    """Load a seq2seq model and its tokenizer from ``model_path``.

    The model is moved to CUDA when available, otherwise it stays on CPU,
    and the "summarization" task-specific parameters are applied to it.

    Args:
        model_path: name or path passed to both ``from_pretrained`` calls.
    """
    if torch.cuda.is_available():
        self.device = "cuda"
    else:
        self.device = "cpu"

    loaded = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    self.model = loaded.to(self.device)
    self.tokenizer = AutoTokenizer.from_pretrained(model_path)

    use_task_specific_params(self.model, "summarization")

    self.batch_size = 1
    self.decoder_start_token_id = None
def __init__(self, hparams, **kwargs):
    """Initialise the module and choose the dataset class from the tokenizer.

    Args:
        hparams: hyper-parameter namespace; dataset limits, target lengths,
            freezing flags, ``num_workers`` and ``tgt_lang`` are read from it.
        **kwargs: forwarded unchanged to the parent constructor.

    Raises:
        AssertionError: if the train target length exceeds the val/test one.
    """
    super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
    use_task_specific_params(self.model, "summarization")
    save_git_info(self.hparams.output_dir)

    self.metrics_save_path = Path(self.output_dir) / "metrics.json"
    self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
    pickle_save(self.hparams, self.hparams_save_path)
    self.step_count = 0
    self.metrics = defaultdict(list)

    self.dataset_kwargs: dict = {
        "data_dir": self.hparams.data_dir,
        "max_source_length": self.hparams.max_source_length,
        "prefix": self.model.config.prefix or "",
    }

    # Negative limits are stored as None.
    requested_counts = {
        "train": self.hparams.n_train,
        "val": self.hparams.n_val,
        "test": self.hparams.n_test,
    }
    limits = {}
    for split, count in requested_counts.items():
        limits[split] = count if count >= 0 else None
    self.n_obs = limits

    self.target_lens = {
        "train": self.hparams.max_target_length,
        "val": self.hparams.val_max_target_length,
        "test": self.hparams.test_max_target_length,
    }
    for split in ("val", "test"):
        assert self.target_lens["train"] <= self.target_lens[split], f"target_lens: {self.target_lens}"

    if self.hparams.freeze_embeds:
        self.freeze_embeds()
    if self.hparams.freeze_encoder:
        freeze_params(self.model.get_encoder())
        assert_all_frozen(self.model.get_encoder())

    self.hparams.git_sha = get_git_info()["repo_sha"]
    self.num_workers = hparams.num_workers

    # For MBart without a configured start token, use the target language id.
    self.decoder_start_token_id = None
    config_has_no_start = self.model.config.decoder_start_token_id is None
    if config_has_no_start and isinstance(self.tokenizer, MBartTokenizer):
        lang_id = self.tokenizer.lang_code_to_id[hparams.tgt_lang]
        self.decoder_start_token_id = lang_id
        self.model.config.decoder_start_token_id = lang_id

    uses_translation_data = isinstance(self.tokenizer, (MBartTokenizer, MarianTokenizer))
    self.dataset_class = TranslationDataset if uses_translation_data else Seq2SeqDataset
def __init__(self, hparams):
    """Build a student by copying teacher layers and set up distillation.

    Args:
        hparams: hyper-parameter namespace. Attributes read here:
            ``data_dir``, ``output_dir``, ``teacher``, ``task``,
            ``student_encoder_layers``, ``student_decoder_layers``,
            ``length_penalty``, ``supervise_forward``, ``alpha_mlm``,
            ``alpha_ce`` and ``alpha_hid``. ``model_name_or_path`` is
            overwritten with the student save directory.

    Raises:
        AssertionError: if ``hparams.data_dir`` does not exist.
    """
    assert Path(hparams.data_dir).exists()
    self.output_dir = Path(hparams.output_dir)
    self.output_dir.mkdir(exist_ok=True)

    student_dir = self.output_dir / "student"
    # Tell lightning we are training the student
    hparams.model_name_or_path = str(student_dir)

    teacher = AutoModelForSeq2SeqLM.from_pretrained(hparams.teacher).eval()
    # We copy good generation parameters to student by default
    use_task_specific_params(teacher, hparams.task)

    student, e_layer_ids, d_layer_ids = create_student_by_copying_alternating_layers(
        teacher,
        e=hparams.student_encoder_layers,
        d=hparams.student_decoder_layers,
        save_path=student_dir,
    )
    if hparams.length_penalty != -1:
        student.config.length_penalty = hparams.length_penalty

    super().__init__(hparams, model=student, config=student.config)

    # Layer ids returned by the student-creation helper (lists of ints).
    self.e_layer_ids = e_layer_ids
    self.d_layer_ids = d_layer_ids

    # T5 layer counts are read from the block lists, others from the config.
    if student.config.model_type == "t5":
        n_teacher_enc = len(teacher.get_encoder().block)
        n_teacher_dec = len(teacher.get_decoder().block)
    else:
        n_teacher_enc = teacher.config.encoder_layers
        n_teacher_dec = teacher.config.decoder_layers

    self.different_encoder = hparams.student_encoder_layers != n_teacher_enc
    self.different_decoder = hparams.student_decoder_layers != n_teacher_dec

    self.teacher = teacher
    freeze_params(self.teacher)

    if not self.different_encoder:
        # To save RAM, delete teacher encoder and freeze student encoder.
        try:
            del self.teacher.model.encoder
        except AttributeError:  # T5
            del self.teacher.encoder

    # Intermediate supervision: Decide which layers to supervise
    if hparams.supervise_forward:
        self.e_matches = get_layers_to_supervise(
            n_student=len(self.e_layer_ids), n_teacher=n_teacher_enc
        )
        self.d_matches = get_layers_to_supervise(
            n_student=len(self.d_layer_ids), n_teacher=n_teacher_dec
        )
    else:
        # student layer should emulate hidden states of the teacher layer it was copied from
        self.e_matches = self.e_layer_ids
        self.d_matches = self.d_layer_ids

    self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
    self.temperature = 2.0
    self.alpha_mlm = hparams.alpha_mlm
    self.alpha_ce = hparams.alpha_ce
    self.alpha_hid = hparams.alpha_hid

    gc.collect()
    torch.cuda.empty_cache()
def generate_summaries_or_translations(
    examples: List[str],
    out_file: str,
    model_name: str,
    batch_size: int = 8,
    device: str = DEFAULT_DEVICE,
    fp16=False,
    task="summarization",
    decoder_start_token_id=None,
    **generate_kwargs,
) -> Dict:
    """Save model.generate results to <out_file>, and return how long it took.

    Args:
        examples: input texts, processed in chunks of ``batch_size``.
        out_file: path of the text file to (over)write, one hypothesis per line.
        model_name: name or path passed to ``from_pretrained``.
        batch_size: number of examples per generation call.
        device: device the model and encoded batches are moved to.
        fp16: when true, the model is converted with ``.half()``.
        task: task name passed to ``use_task_specific_params``.
        decoder_start_token_id: forwarded to ``model.generate``.
        **generate_kwargs: forwarded to ``model.generate``.

    Returns:
        A dict with ``n_obs`` (number of examples), ``runtime`` (whole seconds
        spent after model loading) and ``seconds_per_sample``. For an empty
        ``examples`` list ``seconds_per_sample`` is ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    # The context manager guarantees the file is closed even when loading or
    # generation raises; previously it was only closed on the success path.
    with Path(out_file).open("w", encoding="utf-8") as fout:
        model_name = str(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
        if fp16:
            model = model.half()

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # if this is wrong, check config.model_type.
        logger.info(f"Inferred tokenizer type: {tokenizer.__class__}")

        start_time = time.time()
        # update config with task specific params
        use_task_specific_params(model, task)
        for examples_chunk in tqdm(list(chunks(examples, batch_size))):
            if "t5" in model_name:
                examples_chunk = [model.config.prefix + text for text in examples_chunk]
            batch = tokenizer(examples_chunk, return_tensors="pt", truncation=True, padding="longest").to(device)
            summaries = model.generate(
                input_ids=batch.input_ids,
                attention_mask=batch.attention_mask,
                decoder_start_token_id=decoder_start_token_id,
                **generate_kwargs,
            )
            dec = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for hypothesis in dec:
                fout.write(hypothesis + "\n")
                fout.flush()

    runtime = int(time.time() - start_time)  # seconds
    n_obs = len(examples)
    seconds_per_sample = round(runtime / n_obs, 4) if n_obs else 0.0
    return dict(n_obs=n_obs, runtime=runtime, seconds_per_sample=seconds_per_sample)
def __init__(self, hparams, **kwargs):
    """Initialise the module and load an additional MNLI entailment model.

    Args:
        hparams: hyper-parameter namespace; dataset limits, target lengths,
            freezing flags and ``num_workers`` are read from it.
        **kwargs: forwarded unchanged to the parent constructor.

    Raises:
        AssertionError: if the train target length exceeds the val/test one.
    """
    super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
    use_task_specific_params(self.model, "summarization")
    # Git info is intentionally not saved in this variant.

    self.metrics_save_path = Path(self.output_dir) / "metrics.json"
    self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
    pickle_save(self.hparams, self.hparams_save_path)
    self.step_count = 0
    self.metrics = defaultdict(list)

    self.dataset_kwargs: dict = {
        "data_dir": self.hparams.data_dir,
        "max_source_length": self.hparams.max_source_length,
        "prefix": self.model.config.prefix or "",
    }

    # Negative limits are stored as None.
    requested_counts = {
        "train": self.hparams.n_train,
        "val": self.hparams.n_val,
        "test": self.hparams.n_test,
    }
    limits = {}
    for split, count in requested_counts.items():
        limits[split] = count if count >= 0 else None
    self.n_obs = limits

    self.target_lens = {
        "train": self.hparams.max_target_length,
        "val": self.hparams.val_max_target_length,
        "test": self.hparams.test_max_target_length,
    }
    for split in ("val", "test"):
        assert self.target_lens["train"] <= self.target_lens[split], f"target_lens: {self.target_lens}"

    if self.hparams.freeze_embeds:
        self.freeze_embeds()
    if self.hparams.freeze_encoder:
        freeze_params(self.model.get_encoder())
        assert_all_frozen(self.model.get_encoder())

    self.num_workers = hparams.num_workers
    self.decoder_start_token_id = None

    # Entailment model
    # NOTE(review): the device is hard-coded to 'cuda'; this raises on a
    # machine without a GPU - confirm that is acceptable.
    entailment_name = 'textattack/roberta-base-MNLI'
    self.entailment_tokenizer = AutoTokenizer.from_pretrained(entailment_name)
    self.entailment_model = AutoModelForSequenceClassification.from_pretrained(entailment_name)
    self.entailment_model = self.entailment_model.to('cuda')
def __init__(self, hparams, **kwargs):
    """Initialise the module and resolve evaluation generation settings.

    Args:
        hparams: hyper-parameter namespace; dataset limits, target lengths,
            freezing flags, ``num_workers``, ``eval_beams`` and
            ``eval_max_gen_length`` are read from it.
        **kwargs: forwarded unchanged to the parent constructor.

    Raises:
        AssertionError: if the train target length exceeds the val/test one.
    """
    super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
    use_task_specific_params(self.model, "summarization")
    # Git info is intentionally not saved in this variant.

    self.metrics_save_path = Path(self.output_dir) / "metrics.json"
    self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
    pickle_save(self.hparams, self.hparams_save_path)
    self.step_count = 0
    self.metrics = defaultdict(list)

    self.dataset_kwargs: dict = {
        "data_dir": self.hparams.data_dir,
        "max_source_length": self.hparams.max_source_length,
        "prefix": self.model.config.prefix or "",
    }

    # Negative limits are stored as None.
    requested_counts = {
        "train": self.hparams.n_train,
        "val": self.hparams.n_val,
        "test": self.hparams.n_test,
    }
    limits = {}
    for split, count in requested_counts.items():
        limits[split] = count if count >= 0 else None
    self.n_obs = limits

    self.target_lens = {
        "train": self.hparams.max_target_length,
        "val": self.hparams.val_max_target_length,
        "test": self.hparams.test_max_target_length,
    }
    for split in ("val", "test"):
        assert self.target_lens["train"] <= self.target_lens[split], f"target_lens: {self.target_lens}"

    if self.hparams.freeze_embeds:
        self.freeze_embeds()
    if self.hparams.freeze_encoder:
        freeze_params(self.model.get_encoder())
        assert_all_frozen(self.model.get_encoder())

    self.num_workers = hparams.num_workers
    self.decoder_start_token_id = None
    self.dataset_class = Seq2SeqDataset

    # Explicit hparams win; otherwise fall back to the model config.
    if self.hparams.eval_beams is None:
        self.eval_beams = self.model.config.num_beams
    else:
        self.eval_beams = self.hparams.eval_beams

    if self.hparams.eval_max_gen_length is None:
        self.eval_max_length = self.model.config.max_length
    else:
        self.eval_max_length = self.hparams.eval_max_gen_length
def generate_summaries_or_translations(
    data_loader,
    tokenizer,
    out_file: str,
    model_name: str,
    batch_size: int = 8,
    device: str = DEFAULT_DEVICE,
    fp16=False,
    task="summarization",
    decoder_start_token_id=None,
    **generate_kwargs,
) -> Dict:
    """Save model.generate results to <out_file>, and return how long it took.

    Args:
        data_loader: iterable of batches; each batch is indexed with the keys
            ``"input_ids"``, ``"attention_mask"`` and
            ``"answer_relevance_atten"``, whose values are moved to ``device``.
        tokenizer: object providing ``batch_decode`` for the generated ids.
        out_file: path of the text file to (over)write, one hypothesis per line.
        model_name: name or path passed to ``from_pretrained``.
        batch_size: unused here (batching is done by ``data_loader``); kept
            for interface compatibility.
        device: device the model and batch tensors are moved to.
        fp16: when true, the model is converted with ``.half()``.
        task: task name passed to ``use_task_specific_params``.
        decoder_start_token_id: forwarded to ``model.generate``.
        **generate_kwargs: forwarded to ``model.generate``.

    Returns:
        A dict with ``n_obs`` (number of hypotheses written), ``runtime``
        (whole seconds) and ``seconds_per_sample`` (``0.0`` when nothing was
        generated).
    """
    n_obs = 0
    # The context manager closes the file even if generation raises.
    with Path(out_file).open("w", encoding="utf-8") as fout:
        model_name = str(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
        if fp16:
            model = model.half()

        start_time = time.time()
        # update config with task specific params
        use_task_specific_params(model, task)
        for batch in tqdm(data_loader):
            summaries = model.generate(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                encoder_answer_relevance_atten=batch['answer_relevance_atten'].to(device),
                decoder_start_token_id=decoder_start_token_id,
                **generate_kwargs,
            )
            dec = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for hypothesis in dec:
                fout.write(hypothesis + "\n")
                fout.flush()
            # len(data_loader) would count batches, not samples, so the
            # per-sample timing is derived from what was actually generated.
            n_obs += len(dec)

    runtime = int(time.time() - start_time)  # seconds
    seconds_per_sample = round(runtime / n_obs, 4) if n_obs else 0.0
    return dict(n_obs=n_obs, runtime=runtime, seconds_per_sample=seconds_per_sample)
def generate_summaries_or_translations(
    examples: list,
    out_file: str,
    model_name: str,
    batch_size: int = 8,
    device: str = DEFAULT_DEVICE,
    fp16=False,
    task="summarization",
    decoder_start_token_id=None,
    **gen_kwargs,
) -> None:
    """Generate one output line per example and write them to ``out_file``.

    Args:
        examples: input texts, processed in chunks of ``batch_size``.
        out_file: path of the text file to (over)write, one hypothesis per line.
        model_name: name or path passed to ``from_pretrained``.
        batch_size: number of examples per generation call.
        device: device the model and encoded batches are moved to.
        fp16: when true, the model is converted with ``.half()``.
        task: task name passed to ``use_task_specific_params``.
        decoder_start_token_id: forwarded to ``model.generate``.
        **gen_kwargs: forwarded to ``model.generate``.
    """
    # A "decoder_start_token_id" keyword always binds to the named parameter,
    # so it can never appear in gen_kwargs; the former pop() fallback was dead
    # code and has been removed.

    # The context manager closes the file on every exit path; it was
    # previously never closed.
    with Path(out_file).open("w", encoding="utf-8") as fout:
        model_name = str(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
        if fp16:
            model = model.half()
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # update config with summarization specific params
        use_task_specific_params(model, task)

        for batch in tqdm(list(chunks(examples, batch_size))):
            if "t5" in model_name:
                batch = [model.config.prefix + text for text in batch]
            batch = tokenizer(batch, return_tensors="pt", truncation=True, padding="max_length").to(device)
            # Padding to max_length is trimmed again before generation.
            input_ids, attention_mask = trim_batch(**batch, pad_token_id=tokenizer.pad_token_id)
            summaries = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                decoder_start_token_id=decoder_start_token_id,
                **gen_kwargs,
            )
            dec = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for hypothesis in dec:
                fout.write(hypothesis + "\n")
                fout.flush()
def __init__(self, hparams):
    """Set up the student/teacher pair and the distillation loss weights.

    Args:
        hparams: hyper-parameter namespace; ``data_dir``, ``alpha_mlm``,
            ``alpha_ce``, ``alpha_hid`` and ``alpha_encoder_loss`` are read.

    Raises:
        AssertionError: if ``hparams.data_dir`` does not exist.
    """
    assert Path(hparams.data_dir).exists()

    prepared = self.pre_init(hparams)
    student, student_cfg, teacher = prepared
    super().__init__(hparams, model=student, config=student_cfg)

    # The teacher gets the summarization parameters and is then frozen.
    self.teacher = teacher
    use_task_specific_params(self.teacher, "summarization")
    freeze_params(self.teacher)
    self.sanity_check_gradients()

    self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
    self.temperature = 2.0

    # Loss weights.
    self.alpha_mlm = hparams.alpha_mlm
    self.alpha_ce = hparams.alpha_ce
    self.alpha_hid = hparams.alpha_hid
    self.alpha_encoder_loss = self.hparams.alpha_encoder_loss

    gc.collect()
    torch.cuda.empty_cache()
def generate_summaries(
    examples: list,
    out_file: str,
    model_name: str,
    batch_size: int = 8,
    device: str = DEFAULT_DEVICE,
    fp16=True,
    task="summarization",
    decoder_start_token_id=None,
    finetune_flag: int = 0,
    checkpoint_path: str = "",
    **gen_kwargs,
) -> None:
    """Generate topic-conditioned summaries and write them to ``out_file``.

    For every batch, each input is decoded back to text, cleaned, converted
    to a bag-of-words and passed through the GSM trainer; the resulting
    ``topic_p`` is handed to ``model.generate``.

    Args:
        examples: input texts, processed in chunks of ``batch_size``.
        out_file: path of the text file to (over)write, one hypothesis per line.
        model_name: name or path used for the tokenizer, and for the model
            when ``finetune_flag < 1``.
        batch_size: number of examples per generation call.
        device: device the model and encoded batches are moved to.
        fp16: when true, the model is converted with ``.half()``.
        task: task name passed to ``use_task_specific_params``.
        decoder_start_token_id: forwarded to ``model.generate``.
        finetune_flag: values below 1 load ``model_name``; otherwise the model
            is loaded from ``{checkpoint_path}/best_tfmr``.
        checkpoint_path: directory holding the ``best_tfmr`` checkpoint.
        **gen_kwargs: forwarded to ``model.generate``.
    """
    # A "decoder_start_token_id" keyword always binds to the named parameter,
    # so it can never appear in gen_kwargs; the former pop() fallback was dead
    # code and has been removed.

    # The context manager closes the file on every exit path; it was
    # previously never closed.
    with Path(out_file).open("w", encoding="utf-8") as fout:
        # initialize tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        if finetune_flag < 1:
            # evaluate the original checkpoint
            model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
        else:
            # evaluate the fine-tuned checkpoint
            model = AutoModelForSeq2SeqLM.from_pretrained(f"{checkpoint_path}/best_tfmr").to(device)
        if fp16:
            model = model.half()

        # update config with summarization specific params
        use_task_specific_params(model, task)

        # Batch-independent topic-model inputs are built once instead of once
        # per batch.
        dictionary = Dictionary.load(datapath('dict-www-cnndm-unigram'))
        vocab_size = len(dictionary)  # vocab size for topic modeling
        sep_tokens = frozenset(f"[SEP_{i}]" for i in range(10))
        punctuation = re.compile(r'[^\w\s]')

        for batch in tqdm(list(chunks(examples, batch_size))):
            batch = tokenizer(batch, return_tensors="pt", truncation=True, padding="max_length").to(device)
            input_ids, attention_mask = trim_batch(**batch, pad_token_id=tokenizer.pad_token_id)

            # -----------------------------------------
            # Topic Modeling - GSM
            # -----------------------------------------
            # NOTE(review): config and trainer are still rebuilt for every
            # batch because it is not visible here whether from_config or
            # co_train mutate them; hoist if they turn out to be read-only.
            config = yaml_load("data/config/gsm.yaml")
            config['hidden']['features'][0] = vocab_size
            config['trainer_batch']['test_sample'] = 1
            config = extend_config_reference(config)
            gsm_trainer = config['GSMtrainer']
            gsm_trainer['base_dir'] = "log/bart-large-cnn-finetune"
            gsm_trainer = GSMTrainer.from_config(gsm_trainer)

            docs = []
            for sample_ids in batch['input_ids']:
                decoded = tokenizer.decode(sample_ids.tolist(), skip_special_tokens=True)
                # drop [SEP_n] markers, undo ' ##' word-piece joins, lowercase
                kept = [tok for tok in decoded.split(" ") if tok not in sep_tokens]
                cleaned = ' '.join(kept).replace(' ##', '').lower()
                cleaned = punctuation.sub('', cleaned)
                # bag-of-words format expected by the topic model
                docs.append(dictionary.doc2bow(cleaned.split(" ")))

            # NOTE(review): device='cuda' and .cuda() below ignore the
            # `device` argument - confirm this is intended.
            gsm_data = DataLoader(
                DocDataset(docs, len(dictionary), device='cuda'),
                batch_size=config['dataset']['batch_size'],
                drop_last=False,
                num_workers=0,
            )
            gsm_trainer.__dict__['train_iterator'] = gsm_data
            gsm_loss, gsm_p = gsm_trainer.co_train(vocab_size=vocab_size, training=False)
            del gsm_data
            topic_p = gsm_p.cuda()

            summaries = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                decoder_start_token_id=decoder_start_token_id,
                topic_p=topic_p,
                **gen_kwargs,
            )
            dec = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            for hypothesis in dec:
                fout.write(hypothesis + "\n")
                fout.flush()
def __init__(self, hparams):
    """Set up teacher, student and layer matching for distillation.

    The student is either loaded from ``hparams.student`` or created by
    copying alternating teacher layers.

    Args:
        hparams: hyper-parameter namespace. Attributes read here:
            ``data_dir``, ``output_dir``, ``teacher``, ``student``, ``task``,
            ``student_encoder_layers``, ``student_decoder_layers``,
            ``length_penalty``, ``supervise_forward``, ``alpha_mlm``,
            ``alpha_ce`` and ``alpha_hid``. ``model_name_or_path`` and
            ``tokenizer_name`` are overwritten.

    Raises:
        AssertionError: if ``hparams.data_dir`` does not exist or the teacher
            and student model types differ.
    """
    assert Path(hparams.data_dir).exists()
    self.output_dir = Path(hparams.output_dir)
    self.output_dir.mkdir(exist_ok=True)

    student_dir = self.output_dir / "student"
    # Tell lightning we are training the student
    hparams.model_name_or_path = str(student_dir)

    teacher = AutoModelForSeq2SeqLM.from_pretrained(hparams.teacher).eval()
    # We copy good generation parameters to student by default
    use_task_specific_params(teacher, hparams.task)

    if hparams.student is None:
        student, e_layer_ids, d_layer_ids = create_student_by_copying_alternating_layers(
            teacher,
            e=hparams.student_encoder_layers,
            d=hparams.student_decoder_layers,
            save_path=student_dir,
        )
    else:
        student = AutoModelForSeq2SeqLM.from_pretrained(hparams.student)
        use_task_specific_params(student, hparams.task)
        e_layer_ids = None
        d_layer_ids = None

    if hparams.length_penalty != -1:
        student.config.length_penalty = hparams.length_penalty
    hparams.tokenizer_name = hparams.teacher  # Use teacher's tokenizer
    super().__init__(hparams, model=student, config=student.config)

    assert (
        student.config.model_type == teacher.config.model_type
    ), f"teacher, student model types should be the same, got {student.config.model_type} != {teacher.config.model_type}"

    # T5 layer counts are read from the block lists, others from the config.
    if student.config.model_type == "t5":
        student_encoder_layers = len(student.get_encoder().block)
        student_decoder_layers = len(student.get_decoder().block)
        teacher_encoder_layers = len(teacher.get_encoder().block)
        teacher_decoder_layers = len(teacher.get_decoder().block)
    else:
        student_encoder_layers = student.config.encoder_layers
        student_decoder_layers = student.config.decoder_layers
        teacher_encoder_layers = teacher.config.encoder_layers
        teacher_decoder_layers = teacher.config.decoder_layers

    self.different_base_models = hparams.student is not None and hparams.teacher != hparams.student
    self.do_calc_hidden_loss = (not self.different_base_models) and hparams.alpha_hid > 0
    # self.different_encoder determines whether we need to run the teacher encoder
    self.different_encoder = self.different_base_models or (
        student_encoder_layers != teacher_encoder_layers
    )

    self.teacher = teacher
    freeze_params(self.teacher)

    if not self.different_encoder:
        # To save RAM, delete teacher encoder and freeze student encoder.
        try:
            del self.teacher.model.encoder
        except AttributeError:  # T5
            del self.teacher.encoder

    # A pre-trained student supervises all of its layers.
    if e_layer_ids is None:
        e_layer_ids = list(range(student_encoder_layers))
    if d_layer_ids is None:
        d_layer_ids = list(range(student_decoder_layers))
    self.e_layer_ids = e_layer_ids
    self.d_layer_ids = d_layer_ids

    if not self.do_calc_hidden_loss:
        self.e_matches = None
        self.d_matches = None
    elif hparams.supervise_forward:
        # Intermediate supervision: Decide which layers to supervise
        self.e_matches = get_layers_to_supervise(
            n_student=len(self.e_layer_ids), n_teacher=teacher_encoder_layers
        )
        self.d_matches = get_layers_to_supervise(
            n_student=len(self.d_layer_ids), n_teacher=teacher_decoder_layers
        )
    else:
        # student layer should emulate hidden states of the teacher layer it was copied from
        self.e_matches = self.e_layer_ids
        self.d_matches = self.d_layer_ids

    self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
    self.temperature = 2.0
    self.alpha_mlm = hparams.alpha_mlm
    self.alpha_ce = hparams.alpha_ce
    self.alpha_hid = hparams.alpha_hid

    gc.collect()
    torch.cuda.empty_cache()
def __init__(self, hparams, **kwargs):
    """Initialise the module, register record special tokens and settings.

    Args:
        hparams: hyper-parameter namespace. Besides the dataset related
            attributes this reads ``sortish_sampler``, ``gpus``,
            ``max_tokens_per_batch``, ``num_workers``, ``freeze_embeds``,
            ``freeze_encoder``, ``tgt_lang``, ``eval_beams``,
            ``eval_max_gen_length`` and ``val_metric``.
        **kwargs: forwarded unchanged to the parent constructor.

    Raises:
        NotImplementedError: if ``max_tokens_per_batch`` is combined with
            more than one GPU.
        ValueError: if ``max_tokens_per_batch`` is combined with
            ``sortish_sampler``.
        AssertionError: if the train target length exceeds the val/test one,
            or if the resolved number of evaluation beams is below 1.
    """
    if hparams.sortish_sampler and hparams.gpus > 1:
        hparams.replace_sampler_ddp = False
    elif hparams.max_tokens_per_batch is not None:
        if hparams.gpus > 1:
            raise NotImplementedError("Dynamic Batch size does not work for multi-gpu training")
        if hparams.sortish_sampler:
            raise ValueError("--sortish_sampler and --max_tokens_per_batch may not be used simultaneously")

    super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)

    # Field markers added to the tokenizer as additional special tokens.
    record_tokens = [
        '<|HOME|>', '<|AWAY|>',
        '<|PLAYER-START_POSITION|>', '<|PLAYER-MIN|>', '<|PLAYER-PTS|>',
        '<|PLAYER-FGM|>', '<|PLAYER-FGA|>', '<|PLAYER-FG_PCT|>',
        '<|PLAYER-FG3M|>', '<|PLAYER-FG3A|>', '<|PLAYER-FG3_PCT|>',
        '<|PLAYER-FTM|>', '<|PLAYER-FTA|>', '<|PLAYER-FT_PCT|>',
        '<|PLAYER-OREB|>', '<|PLAYER-DREB|>', '<|PLAYER-REB|>',
        '<|PLAYER-AST|>', '<|PLAYER-TO|>', '<|PLAYER-STL|>',
        '<|PLAYER-BLK|>', '<|PLAYER-PF|>',
        '<|TEAM-PTS_QTR1|>', '<|TEAM-PTS_QTR2|>', '<|TEAM-PTS_QTR3|>',
        '<|TEAM-PTS_QTR4|>', '<|TEAM-PTS|>', '<|TEAM-FG_PCT|>',
        '<|TEAM-FG3_PCT|>', '<|TEAM-FT_PCT|>', '<|TEAM-REB|>',
        '<|TEAM-AST|>', '<|TEAM-TOV|>', '<|TEAM-WINS|>',
        '<|TEAM-LOSSES|>', '<|TEAM-CITY|>', '<|TEAM-NAME|>',
    ]
    self.tokenizer.add_special_tokens({'additional_special_tokens': record_tokens})
    # The embedding matrix is resized to the new tokenizer length.
    self.model.resize_token_embeddings(len(self.tokenizer))
    print(len(self.tokenizer))

    use_task_specific_params(self.model, "summarization")
    self.metrics_save_path = Path(self.output_dir) / "metrics.json"
    self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
    pickle_save(self.hparams, self.hparams_save_path)
    self.step_count = 0
    self.metrics = defaultdict(list)
    self.model_type = self.config.model_type
    self.vocab_size = self.config.tgt_vocab_size if self.model_type == "fsmt" else self.config.vocab_size

    self.dataset_kwargs: dict = dict(
        data_dir=self.hparams.data_dir,
        max_source_length=self.hparams.max_source_length,
        prefix=self.model.config.prefix or "",
    )
    n_observations_per_split = {
        "train": self.hparams.n_train,
        "val": self.hparams.n_val,
        "test": self.hparams.n_test,
    }
    # A negative count means "use everything" and is stored as None.
    self.n_obs = {k: v if v >= 0 else None for k, v in n_observations_per_split.items()}

    self.target_lens = {
        "train": self.hparams.max_target_length,
        "val": self.hparams.val_max_target_length,
        "test": self.hparams.test_max_target_length,
    }
    assert self.target_lens["train"] <= self.target_lens["val"], f"target_lens: {self.target_lens}"
    assert self.target_lens["train"] <= self.target_lens["test"], f"target_lens: {self.target_lens}"

    if self.hparams.freeze_embeds:
        self.freeze_embeds()
    if self.hparams.freeze_encoder:
        freeze_params(self.model.get_encoder())
        assert_all_frozen(self.model.get_encoder())

    self.num_workers = hparams.num_workers

    self.decoder_start_token_id = None  # default to config
    if self.model.config.decoder_start_token_id is None and isinstance(self.tokenizer, MBartTokenizer):
        self.decoder_start_token_id = self.tokenizer.lang_code_to_id[hparams.tgt_lang]
        self.model.config.decoder_start_token_id = self.decoder_start_token_id

    # Tokenizers without prepare_seq2seq_batch fall back to the legacy dataset.
    self.dataset_class = (
        Seq2SeqDataset if hasattr(self.tokenizer, "prepare_seq2seq_batch") else LegacySeq2SeqDataset
    )

    self.eval_beams = self.model.config.num_beams if self.hparams.eval_beams is None else self.hparams.eval_beams
    # The message now matches the check: a single beam is accepted.
    assert self.eval_beams >= 1, f"got self.eval_beams={self.eval_beams}. Need an integer >= 1"

    if self.hparams.eval_max_gen_length is not None:
        self.eval_max_length = self.hparams.eval_max_gen_length
    else:
        self.eval_max_length = self.model.config.max_length
    self.val_metric = self.default_val_metric if self.hparams.val_metric is None else self.hparams.val_metric

    self.get_freq_sequences(self.hparams.data_dir)
    self.seq_loss_weight = 2
def main():
    """Parse arguments, build model/datasets/trainer, then train/eval/predict.

    Arguments come either from a single ``.json`` file given as the only
    command-line argument or from the regular command line.

    Returns:
        A dict collecting the metrics of every phase that was run.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

    only_json_arg = len(sys.argv) == 2 and sys.argv[1].endswith(".json")
    if only_json_arg:
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        parsed = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        parsed = parser.parse_args_into_dataclasses()
    model_args, data_args, training_args = parsed

    check_output_dir(training_args)

    # Setup logging
    log_level = logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=log_level,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED),
        training_args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = BartConfig.from_pretrained(
        model_args.config_name or model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # Truthy dropout-style training arguments override the model config.
    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
    for p in extra_model_params:
        if not getattr(training_args, p, None):
            continue
        assert hasattr(config, p), f"({config.__class__.__name__}) doesn't have a `{p}` attribute"
        setattr(config, p, getattr(training_args, p))

    tokenizer = BartTokenizer.from_pretrained(
        model_args.tokenizer_name or model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = BartForConditionalGeneration.from_pretrained(
        model_args.model_name_or_path,
        from_tf=".ckpt" in model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # use task specific params
    use_task_specific_params(model, data_args.task)

    # set num_beams for evaluation
    if data_args.eval_beams is None:
        data_args.eval_beams = model.config.num_beams

    # set decoder_start_token_id for MBart
    if model.config.decoder_start_token_id is None and isinstance(tokenizer, MBartTokenizer):
        assert (
            data_args.tgt_lang is not None and data_args.src_lang is not None
        ), "mBart requires --tgt_lang and --src_lang"
        model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.tgt_lang]

    if model_args.freeze_embeds:
        freeze_embeds(model)
    if model_args.freeze_encoder:
        freeze_params(model.get_encoder())
        assert_all_frozen(model.get_encoder())

    dataset_class = Seq2SeqDataset

    def build_split(type_path, n_obs, max_target_length):
        # Shared construction for the train / val / test datasets.
        return dataset_class(
            tokenizer,
            type_path=type_path,
            data_dir=data_args.data_dir,
            n_obs=n_obs,
            max_target_length=max_target_length,
            max_source_length=data_args.max_source_length,
            prefix=model.config.prefix or "",
        )

    # Get datasets
    train_dataset = None
    if training_args.do_train:
        train_dataset = build_split("train", data_args.n_train, data_args.max_target_length)

    eval_dataset = None
    if training_args.do_eval or training_args.evaluation_strategy != EvaluationStrategy.NO:
        eval_dataset = build_split("val", data_args.n_val, data_args.val_max_target_length)

    test_dataset = None
    if training_args.do_predict:
        test_dataset = build_split("test", data_args.n_test, data_args.test_max_target_length)

    # Initialize our Trainer
    compute_metrics_fn = None
    if training_args.predict_with_generate:
        compute_metrics_fn = build_compute_metrics_fn(data_args.task, tokenizer)
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=Seq2SeqDataCollator(tokenizer, data_args, training_args.tpu_num_cores),
        compute_metrics=compute_metrics_fn,
        tokenizer=tokenizer,
    )

    all_metrics = {}

    # Training
    if training_args.do_train:
        logger.info("*** Train ***")

        resume_path = None
        if os.path.isdir(model_args.model_name_or_path):
            resume_path = model_args.model_name_or_path
        train_result = trainer.train(model_path=resume_path)

        metrics = train_result.metrics
        metrics["train_n_objs"] = data_args.n_train

        trainer.save_model()  # this also saves the tokenizer

        if trainer.is_world_process_zero():
            handle_metrics("train", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            state_file = os.path.join(training_args.output_dir, "trainer_state.json")
            trainer.state.save_to_json(state_file)

            # For convenience, we also re-save the tokenizer to the same directory,
            # so that you can share your model easily on huggingface.co/models =)
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate(
            metric_key_prefix="val",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        metrics["val_n_objs"] = data_args.n_val
        metrics["val_loss"] = round(metrics["val_loss"], 4)

        if trainer.is_world_process_zero():
            handle_metrics("val", metrics, training_args.output_dir)
            all_metrics.update(metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")

        # NOTE(review): generation length uses val_max_target_length although
        # the test set is built with test_max_target_length - confirm intended.
        test_output = trainer.predict(
            test_dataset=test_dataset,
            metric_key_prefix="test",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        metrics = test_output.metrics
        metrics["test_n_objs"] = data_args.n_test

        if trainer.is_world_process_zero():
            metrics["test_loss"] = round(metrics["test_loss"], 4)
            handle_metrics("test", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            if training_args.predict_with_generate:
                decoded = tokenizer.batch_decode(
                    test_output.predictions,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True,
                )
                test_preds = lmap(str.strip, decoded)
                generations_file = os.path.join(training_args.output_dir, "test_generations.txt")
                write_txt_file(test_preds, generations_file)

    if trainer.is_world_process_zero():
        save_json(all_metrics, os.path.join(training_args.output_dir, "all_results.json"))

    return all_metrics
def eval_data_dir(
    data_dir,
    save_dir: str,
    model_name: str,
    bs: int = 8,
    max_source_length: int = 1024,
    type_path="val",
    n_obs=None,
    fp16=False,
    save_source=False,
    num_beams: int = 4,
    task="summarization",
    local_rank=None,
    **generate_kwargs,
) -> Dict:
    """Generate predictions for one rank's share of a dataset and save them as JSON.

    The process joins an NCCL process group as ``local_rank``, loads
    ``model_name`` onto that CUDA device, generates for every batch produced by
    the dataset's distributed sortish sampler and writes the decoded records to
    ``{save_dir}/rank_{local_rank}_output.json``.

    Args:
        data_dir: Directory handed to ``Seq2SeqDataset``.
        save_dir: Directory that receives the per-rank JSON file.
        model_name: Name or path passed to ``from_pretrained``.
        bs: Batch size used by both the sampler and the data loader.
        max_source_length: Source truncation length; ``None`` falls back to
            ``tokenizer.model_max_length``.
        type_path: Dataset split to read.
        n_obs: Optional limit on the number of examples, forwarded to the dataset.
        fp16: Convert the model to half precision when true.
        save_source: Also store the decoded source text in each record.
        num_beams: Beam count forwarded to ``model.generate``.
        task: Key used by ``use_task_specific_params`` to update the config.
        local_rank: Rank / CUDA device index of this process (required).
        **generate_kwargs: Extra keyword arguments for ``model.generate``.

    Returns:
        The list of record dicts (``pred``, ``label`` and optionally ``source``).
    """
    model_name = str(model_name)
    assert local_rank is not None
    torch.distributed.init_process_group(backend="nccl", rank=local_rank)

    save_path = Path(save_dir).joinpath(f"rank_{local_rank}_output.json")

    torch.cuda.set_device(local_rank)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).cuda()
    if fp16:
        model = model.half()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # if this is wrong, check config.model_type.
    logger.info(f"Inferred tokenizer type: {tokenizer.__class__}")
    # update config with task specific params
    use_task_specific_params(model, task)
    if max_source_length is None:
        max_source_length = tokenizer.model_max_length

    ds = Seq2SeqDataset(
        tokenizer,
        data_dir,
        max_source_length,
        max_target_length=1024,
        type_path=type_path,
        n_obs=n_obs,
        prefix=model.config.prefix,
    )
    data_loader = DataLoader(
        ds,
        sampler=ds.make_sortish_sampler(bs, distributed=True),
        batch_size=bs,
        collate_fn=ds.collate_fn,
    )

    def decode(token_ids):
        # Same decoding options for predictions, labels and sources.
        return tokenizer.batch_decode(
            token_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

    results = []
    for batch in tqdm(data_loader):
        device = model.device
        generated = model.generate(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            num_beams=num_beams,
            **generate_kwargs,
        )
        preds = decode(generated)
        labels = decode(batch["labels"])
        docs = decode(batch["input_ids"]) if save_source else None
        for idx, label in enumerate(labels):
            record = dict(pred=preds[idx], label=label)
            if save_source:
                record["source"] = docs[idx]
            results.append(record)

    save_json(results, save_path)
    return results
def eval_data_dir(
    data_dir,
    save_dir: str,
    model_name: str,
    bs: int = 8,
    max_source_length: int = 1024,
    type_path="val",
    n_obs=None,
    fp16=False,
    task="summarization",
    local_rank=None,
    **generate_kwargs,
) -> Dict:
    """Generate predictions for one rank's share of a dataset and save them as JSON.

    The process joins an NCCL process group as ``local_rank``, loads
    ``model_name`` onto that CUDA device, generates for every batch and writes
    ``{"pred", "id"}`` records to ``{save_dir}/rank_{local_rank}_output.json``.

    Args:
        data_dir: Directory handed to ``Seq2SeqDataset``.
        save_dir: Directory that receives the per-rank JSON file.
        model_name: Name or path passed to ``from_pretrained``.
        bs: Batch size used by both the sampler and the data loader.
        max_source_length: Source truncation length; ``None`` falls back to
            ``tokenizer.model_max_length``.
        type_path: Dataset split to read.
        n_obs: Optional limit on the number of examples, forwarded to the dataset.
        fp16: Convert the model to half precision when true.
        task: Key used by ``use_task_specific_params`` to update the config.
        local_rank: Rank / CUDA device index of this process (required).
        **generate_kwargs: Extra keyword arguments for ``model.generate``.

    Returns:
        A ``(results, num_replicas)`` tuple: the list of record dicts and the
        sampler's ``num_replicas`` attribute.
    """
    model_name = str(model_name)
    assert local_rank is not None
    torch.distributed.init_process_group(backend="nccl", rank=local_rank)

    save_path = Path(save_dir).joinpath(f"rank_{local_rank}_output.json")

    torch.cuda.set_device(local_rank)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).cuda()
    if fp16:
        model = model.half()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # if this is wrong, check config.model_type.
    logger.info(f"Inferred tokenizer type: {tokenizer.__class__}")
    # update config with task specific params
    use_task_specific_params(model, task)
    if max_source_length is None:
        max_source_length = tokenizer.model_max_length

    ds = Seq2SeqDataset(
        tokenizer,
        data_dir,
        max_source_length,
        max_target_length=1024,
        type_path=type_path,
        n_obs=n_obs,
        prefix=model.config.prefix,
    )
    # shuffle=True gives a more accurate progress bar: if all the longest
    # samples come first, the estimate is too high at the beginning.
    sampler = ds.make_sortish_sampler(
        bs,
        distributed=True,
        add_extra_examples=False,
        shuffle=True,
    )
    data_loader = DataLoader(
        ds,
        sampler=sampler,
        batch_size=bs,
        collate_fn=ds.collate_fn,
    )

    results = []
    for batch in tqdm(data_loader):
        device = model.device
        generated = model.generate(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            **generate_kwargs,
        )
        decoded = tokenizer.batch_decode(
            generated,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        example_ids = batch["ids"]
        # Pair every prediction with the id stored at the same batch position.
        results.extend(
            dict(pred=text, id=example_ids[pos].item())
            for pos, text in enumerate(decoded)
        )

    save_json(results, save_path)
    return results, sampler.num_replicas
def main():
    """Train, quantize, benchmark and/or evaluate a seq2seq model from CLI arguments.

    Arguments are parsed into model, data and training dataclasses, either from
    a single ``.json`` file or from the command line. Depending on the training
    flags the function then:

    * trains and saves the model (``do_train``);
    * runs LPOT quantization tuning and exits (``tune``);
    * evaluates the fp32 or int8 model, prints accuracy/latency and exits
      (``benchmark`` / ``accuracy_only``);
    * evaluates on the validation split (``do_eval``) and predicts on the test
      split (``do_predict``).

    Returns:
        A dict with all metrics collected by the train/eval/predict steps.

    Raises:
        KeyError: from the tuning evaluation function if the evaluation results
            contain none of the expected BLEU/ROUGE metrics.
        SystemExit: after the ``tune``, ``benchmark`` or ``accuracy_only`` paths.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    check_output_dir(training_args)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED),
        training_args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # Copy any truthy dropout/layerdrop override from the training args onto the config.
    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout",
                          "attention_dropout")
    for p in extra_model_params:
        if getattr(training_args, p, None):
            assert hasattr(
                config, p
            ), f"({config.__class__.__name__}) doesn't have a `{p}` attribute"
            setattr(config, p, getattr(training_args, p))

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_args.model_name_or_path,
        from_tf=".ckpt" in model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # use task specific params
    use_task_specific_params(model, data_args.task)

    # set num_beams for evaluation
    if data_args.eval_beams is None:
        data_args.eval_beams = model.config.num_beams

    # set decoder_start_token_id for MBart
    if model.config.decoder_start_token_id is None and isinstance(
            tokenizer, MBartTokenizer):
        assert (data_args.tgt_lang is not None
                and data_args.src_lang is not None
                ), "mBart requires --tgt_lang and --src_lang"
        model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
            data_args.tgt_lang]

    if model_args.freeze_embeds:
        freeze_embeds(model)
    if model_args.freeze_encoder:
        freeze_params(model.get_encoder())
        assert_all_frozen(model.get_encoder())

    dataset_class = Seq2SeqDataset

    # Get datasets
    train_dataset = (dataset_class(
        tokenizer,
        type_path="train",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_train,
        max_target_length=data_args.max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_train else None)
    eval_dataset = (dataset_class(
        tokenizer,
        type_path="val",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_val,
        max_target_length=data_args.val_max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_eval
                    or training_args.evaluation_strategy != EvaluationStrategy.NO
                    else None)
    test_dataset = (dataset_class(
        tokenizer,
        type_path="test",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_test,
        max_target_length=data_args.test_max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_predict else None)

    # Initialize our Trainer
    compute_metrics_fn = (build_compute_metrics_fn(data_args.task, tokenizer)
                          if training_args.predict_with_generate else None)
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=Seq2SeqDataCollator(tokenizer, data_args,
                                          training_args.tpu_num_cores),
        compute_metrics=compute_metrics_fn,
        tokenizer=tokenizer,
    )

    all_metrics = {}
    # Training
    if training_args.do_train:
        logger.info("*** Train ***")

        train_result = trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)
        metrics = train_result.metrics
        metrics["train_n_objs"] = data_args.n_train

        trainer.save_model()  # this also saves the tokenizer

        if trainer.is_world_process_zero():
            handle_metrics("train", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

            # For convenience, we also re-save the tokenizer to the same directory,
            # so that you can share your model easily on huggingface.co/models =)
            tokenizer.save_pretrained(training_args.output_dir)

    if training_args.tune:

        def eval_func_for_lpot(model):
            """Evaluate ``model`` on the validation set and return one scalar score."""
            trainer.model = model
            results = trainer.evaluate(
                eval_dataset=eval_dataset,
                metric_key_prefix="val",
                max_length=data_args.val_max_target_length,
                num_beams=data_args.eval_beams)
            assert data_args.task.startswith("summarization") or data_args.task.startswith("translation"), \
                "data_args.task should startswith summarization or translation"
            task_metrics_keys = [
                'val_bleu', 'val_rouge1', 'val_rouge2', 'val_rougeL',
                'val_rougeLsum'
            ]
            # The first expected metric that is present decides the score:
            # BLEU is used as is, ROUGE variants are averaged.
            for key in task_metrics_keys:
                if key in results:
                    logger.info("Finally Eval {}:{}".format(key, results[key]))
                    if 'bleu' in key:
                        return results[key]
                    return sum(
                        v for k, v in results.items() if "rouge" in k) / 4
            # Previously this fell through to `return acc` with `acc` unbound.
            raise KeyError(
                f"None of the expected metrics {task_metrics_keys} were found "
                f"in the evaluation results: {sorted(results)}")

        from lpot.experimental import Quantization, common

        quantizer = Quantization("./conf.yaml")
        quantizer.model = common.Model(model)
        quantizer.calib_dataloader = common.DataLoader(
            eval_dataset,
            batch_size=training_args.eval_batch_size,
            collate_fn=Seq2SeqDataCollator_lpot(tokenizer, data_args,
                                                training_args.tpu_num_cores))
        quantizer.eval_func = eval_func_for_lpot
        q_model = quantizer()
        q_model.save(training_args.tuned_checkpoint)
        # sys.exit instead of the site-provided exit(), which is not always defined.
        sys.exit(0)

    def _model_for_eval():
        """Return the tuned int8 model when requested, otherwise the loaded model."""
        if training_args.int8:
            from lpot.utils.pytorch import load

            return load(
                os.path.abspath(
                    os.path.expanduser(training_args.tuned_checkpoint)),
                model)
        return model

    def _print_accuracy(results):
        """Print the task accuracy (mean ROUGE or BLEU) from evaluation results."""
        if data_args.task.startswith("summarization"):
            print('Accuracy: %.4f' %
                  (sum(v for k, v in results.items() if "rouge" in k) / 4))
        if data_args.task.startswith("translation"):
            print('Accuracy: %.4f' % (results['val_bleu']))

    if training_args.benchmark:
        trainer.model = _model_for_eval()
        results = trainer.evaluate(
            eval_dataset=eval_dataset,
            metric_key_prefix="val",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
            iters=training_args.iters,
            warmup_iter=training_args.warmup_iter,
        )
        _print_accuracy(results)
        print('Throughput: %.3f samples/sec' %
              (results["val_samples_per_second"]))
        print('Latency: %.3f ms' %
              (1 * 1000 / results["val_samples_per_second"]))
        print('Batch size = %d' % training_args.per_device_eval_batch_size)
        sys.exit(0)

    if training_args.accuracy_only:
        trainer.model = _model_for_eval()
        results = trainer.evaluate(
            eval_dataset=eval_dataset,
            metric_key_prefix="val",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        _print_accuracy(results)
        print('Latency: %.3f ms' %
              (1 * 1000 / results["val_samples_per_second"]))
        print('Batch size = %d' % training_args.per_device_eval_batch_size)
        sys.exit(0)

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate(
            metric_key_prefix="val",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        metrics["val_n_objs"] = data_args.n_val
        metrics["val_loss"] = round(metrics["val_loss"], 4)

        if trainer.is_world_process_zero():
            handle_metrics("val", metrics, training_args.output_dir)
            all_metrics.update(metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")

        # NOTE(review): generation on the test split is capped by
        # val_max_target_length, not test_max_target_length - confirm intended.
        test_output = trainer.predict(
            test_dataset=test_dataset,
            metric_key_prefix="test",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.eval_beams,
        )
        metrics = test_output.metrics
        metrics["test_n_objs"] = data_args.n_test

        if trainer.is_world_process_zero():
            metrics["test_loss"] = round(metrics["test_loss"], 4)
            handle_metrics("test", metrics, training_args.output_dir)
            all_metrics.update(metrics)

            if training_args.predict_with_generate:
                test_preds = tokenizer.batch_decode(
                    test_output.predictions,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True)
                test_preds = lmap(str.strip, test_preds)
                write_txt_file(
                    test_preds,
                    os.path.join(training_args.output_dir,
                                 "test_generations.txt"))

    if trainer.is_world_process_zero():
        save_json(all_metrics,
                  os.path.join(training_args.output_dir, "all_results.json"))

    return all_metrics
def main():
    """Train, evaluate and/or predict with a seq2seq model from CLI arguments.

    Arguments are parsed into model, data and training dataclasses, either from
    a single ``.json`` file or from the command line. The model, tokenizer and
    datasets are built, then the requested steps (``do_train``, ``do_eval``,
    ``do_predict``) run through ``Seq2SeqTrainer``. Result files are written to
    ``training_args.output_dir`` by the world-zero process only.

    Returns:
        A dict merging the evaluation and test metrics that were computed.

    Raises:
        ValueError: if the output directory exists, is non-empty, training is
            requested and ``overwrite_output_dir`` is not set.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # Copy any truthy dropout/layerdrop override from the training args onto the config.
    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout",
                          "attention_dropout")
    for p in extra_model_params:
        if getattr(training_args, p, None):
            assert hasattr(
                config, p
            ), f"({config.__class__.__name__}) doesn't have a `{p}` attribute"
            setattr(config, p, getattr(training_args, p))

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_args.model_name_or_path,
        from_tf=".ckpt" in model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # use task specific params
    use_task_specific_params(model, data_args.task)

    # set num_beams for evaluation
    if data_args.eval_beams is None:
        data_args.eval_beams = model.config.num_beams

    # set decoder_start_token_id for MBart
    if model.config.decoder_start_token_id is None and isinstance(
            tokenizer, MBartTokenizer):
        assert (data_args.tgt_lang is not None
                and data_args.src_lang is not None
                ), "mBart requires --tgt_lang and --src_lang"
        model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
            data_args.tgt_lang]

    if model_args.freeze_embeds:
        freeze_embeds(model)
    if model_args.freeze_encoder:
        freeze_params(model.get_encoder())
        assert_all_frozen(model.get_encoder())

    dataset_class = Seq2SeqDataset if hasattr(
        tokenizer, "prepare_seq2seq_batch") else LegacySeq2SeqDataset

    # Get datasets
    train_dataset = (dataset_class(
        tokenizer,
        type_path="train",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_train,
        max_target_length=data_args.max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_train else None)
    eval_dataset = (dataset_class(
        tokenizer,
        type_path="val",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_val,
        max_target_length=data_args.val_max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_eval
                    or training_args.evaluation_strategy != EvaluationStrategy.NO
                    else None)
    test_dataset = (dataset_class(
        tokenizer,
        type_path="test",
        data_dir=data_args.data_dir,
        n_obs=data_args.n_test,
        max_target_length=data_args.test_max_target_length,
        max_source_length=data_args.max_source_length,
        prefix=model.config.prefix or "",
    ) if training_args.do_predict else None)

    # Initialize our Trainer
    compute_metrics_fn = (build_compute_metrics_fn(data_args.task, tokenizer)
                          if training_args.predict_with_generate else None)
    trainer = Seq2SeqTrainer(
        model=model,
        config=config,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=Seq2SeqDataCollator(tokenizer, data_args,
                                          training_args.tpu_num_cores),
        compute_metrics=compute_metrics_fn,
        data_args=data_args,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        if trainer.is_world_process_zero():
            logger.info("***** Eval results *****")
            for key, value in result.items():
                logger.info(" %s = %s", key, value)
            save_json(
                result,
                os.path.join(training_args.output_dir, "eval_results.json"))
            eval_results.update(result)

    if training_args.do_predict:
        # Use the module logger (not the root `logging` module) like every other message.
        logger.info("*** Test ***")

        test_output = trainer.predict(test_dataset=test_dataset)
        # Report the prediction metrics under a "test" prefix instead of "eval".
        test_metrics = {
            k.replace("eval", "test"): v
            for k, v in test_output.metrics.items()
        }

        if trainer.is_world_process_zero():
            logger.info("***** Test results *****")
            for key, value in test_metrics.items():
                logger.info(" %s = %s", key, value)

            save_json(
                test_metrics,
                os.path.join(training_args.output_dir, "test_results.json"))
            eval_results.update(test_metrics)

            if training_args.predict_with_generate:
                test_preds = tokenizer.batch_decode(
                    test_output.predictions,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True)
                test_preds = lmap(str.strip, test_preds)
                write_txt_file(
                    test_preds,
                    os.path.join(training_args.output_dir,
                                 "test_generations.txt"))

    if trainer.is_world_process_zero():
        # Write next to the other result files rather than into the current directory.
        save_json(eval_results,
                  os.path.join(training_args.output_dir, "all_results.json"))
    return eval_results
def __init__(self, hparams, **kwargs):
    """Set up the module from ``hparams``.

    Validates the sampler / dynamic-batching options, delegates model setup to
    the parent constructor, saves git info and a pickle of the hyper-parameters
    and then derives dataset, target-length and evaluation settings.

    Raises:
        NotImplementedError: if ``max_tokens_per_batch`` is set with more than
            one GPU.
        ValueError: if ``sortish_sampler`` and ``max_tokens_per_batch`` are
            both set.
    """
    sortish_on_multi_gpu = hparams.sortish_sampler and hparams.gpus > 1
    if sortish_on_multi_gpu:
        hparams.replace_sampler_ddp = False
    elif hparams.max_tokens_per_batch is not None:
        if hparams.gpus > 1:
            raise NotImplementedError(
                "Dynamic Batch size does not work for multi-gpu training")
        if hparams.sortish_sampler:
            raise ValueError(
                "--sortish_sampler and --max_tokens_per_batch may not be used simultaneously"
            )

    super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
    use_task_specific_params(self.model, "summarization")
    save_git_info(self.hparams.output_dir)

    # Output locations and bookkeeping.
    self.metrics_save_path = Path(self.output_dir) / "metrics.json"
    self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
    pickle_save(self.hparams, self.hparams_save_path)
    self.step_count = 0
    self.metrics = defaultdict(list)

    self.model_type = self.config.model_type
    if self.model_type == "fsmt":
        self.vocab_size = self.config.tgt_vocab_size
    else:
        self.vocab_size = self.config.vocab_size

    self.val_dataloader_names = []
    self.dataset_kwargs: dict = dict(
        data_dir=self.hparams.data_dir,
        max_source_length=self.hparams.max_source_length,
        prefix=self.model.config.prefix or "",
    )

    # A negative count means "no limit" and is stored as None.
    requested_counts = (
        ("train", self.hparams.n_train),
        ("val", self.hparams.n_val),
        ("test", self.hparams.n_test),
    )
    self.n_obs = {
        split: (count if count >= 0 else None)
        for split, count in requested_counts
    }

    self.target_lens = {
        "train": self.hparams.max_target_length,
        "val": self.hparams.val_max_target_length,
        "test": self.hparams.test_max_target_length,
    }
    # The training target length may not exceed the val/test target lengths.
    for split in ("val", "test"):
        assert self.target_lens["train"] <= self.target_lens[
            split], f"target_lens: {self.target_lens}"

    if self.hparams.freeze_embeds:
        freeze_embeds(self.model)
    if self.hparams.freeze_encoder:
        encoder = self.model.get_encoder()
        freeze_params(encoder)
        assert_all_frozen(self.model.get_encoder())

    self.hparams.git_sha = get_git_info()["repo_sha"]
    self.num_workers = hparams.num_workers

    if hasattr(self.tokenizer, "prepare_seq2seq_batch"):
        self.dataset_class = Seq2SeqDataset
    else:
        self.dataset_class = LegacySeq2SeqDataset
    self.already_saved_batch = False

    # Evaluation settings: hparams win, otherwise fall back to defaults.
    if self.hparams.eval_beams is None:
        self.eval_beams = self.model.config.num_beams
    else:
        self.eval_beams = self.hparams.eval_beams
    self.eval_max_length = (self.hparams.eval_max_gen_length
                            if self.hparams.eval_max_gen_length is not None
                            else self.model.config.max_length)
    if self.hparams.val_metric is None:
        self.val_metric = self.default_val_metric
    else:
        self.val_metric = self.hparams.val_metric
def __init__(self, hparams, **kwargs):
    """Set up the module from ``hparams``.

    Validates the sampler / dynamic-batching options, delegates model setup to
    the parent constructor, pickles the hyper-parameters and then derives
    dataset, target-length, decoder-start-token and evaluation settings, plus
    counters used for loss logging.

    Raises:
        NotImplementedError: if ``max_tokens_per_batch`` is set with more than
            one GPU.
        ValueError: if ``sortish_sampler`` and ``max_tokens_per_batch`` are
            both set.
    """
    sortish_on_multi_gpu = hparams.sortish_sampler and hparams.gpus > 1
    if sortish_on_multi_gpu:
        hparams.replace_sampler_ddp = False
    elif hparams.max_tokens_per_batch is not None:
        if hparams.gpus > 1:
            raise NotImplementedError("Dynamic Batch size does not work for multi-gpu training")
        if hparams.sortish_sampler:
            raise ValueError("--sortish_sampler and --max_tokens_per_batch may not be used simultaneously")

    super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
    use_task_specific_params(self.model, "summarization")

    # Output locations and bookkeeping.
    self.metrics_save_path = Path(self.output_dir) / "metrics.json"
    self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
    pickle_save(self.hparams, self.hparams_save_path)
    self.step_count = 0
    self.metrics = defaultdict(list)

    self.model_type = self.config.model_type
    if self.model_type == "fsmt":
        self.vocab_size = self.config.tgt_vocab_size
    else:
        self.vocab_size = self.config.vocab_size

    self.dataset_kwargs: dict = dict(
        data_dir=self.hparams.data_dir,
        max_source_length=self.hparams.max_source_length,
        prefix=self.model.config.prefix or "",
    )

    # A negative count means "no limit" and is stored as None.
    requested_counts = (
        ("train", self.hparams.n_train),
        ("val", self.hparams.n_val),
        ("test", self.hparams.n_test),
    )
    self.n_obs = {split: (count if count >= 0 else None) for split, count in requested_counts}

    # All three splits share the same maximum target length here.
    self.target_lens = {
        "train": self.hparams.max_target_length,
        "val": self.hparams.max_target_length,
        "test": self.hparams.max_target_length,
    }
    for split in ("val", "test"):
        assert self.target_lens["train"] <= self.target_lens[split], f"target_lens: {self.target_lens}"

    if self.hparams.freeze_embeds:
        freeze_embeds(self.model)
    if self.hparams.freeze_encoder:
        encoder = self.model.get_encoder()
        freeze_params(encoder)
        assert_all_frozen(self.model.get_encoder())

    self.num_workers = hparams.num_workers

    self.decoder_start_token_id = None  # default to config
    needs_lang_start_token = self.model.config.decoder_start_token_id is None and isinstance(
        self.tokenizer, MBartTokenizer
    )
    if needs_lang_start_token:
        # Start decoding with the language code of the target language.
        self.decoder_start_token_id = self.tokenizer.lang_code_to_id[hparams.tgt_lang]
        self.model.config.decoder_start_token_id = self.decoder_start_token_id

    if hasattr(self.tokenizer, "prepare_seq2seq_batch"):
        self.dataset_class = Seq2SeqDataset
    else:
        self.dataset_class = LegacySeq2SeqDataset
    self.already_saved_batch = False

    # Evaluation settings: hparams win, otherwise fall back to defaults.
    if self.hparams.eval_beams is None:
        self.eval_beams = self.model.config.num_beams
    else:
        self.eval_beams = self.hparams.eval_beams
    self.eval_max_length = (
        self.hparams.eval_max_gen_length
        if self.hparams.eval_max_gen_length is not None
        else self.model.config.max_length
    )
    if self.hparams.val_metric is None:
        self.val_metric = self.default_val_metric
    else:
        self.val_metric = self.hparams.val_metric

    # Counters for logging the unlikelihood loss.
    self.num_outputs = 0
    self.num_ul = 0

    # Loss history for plotting training curves: a list of dicts, each holding
    # the average loss (sum / batch size) for each kind of loss function.
    # Used to determine the relative contributions of UL and standard cross entropy.
    self.losses = []
def __init__(self, hparams, **kwargs):
    """Set up the module from ``hparams``, optionally around an encoder-decoder pair.

    When ``hparams.model_name_or_path`` is the literal ``"encoder-decoder"``,
    an ``EncoderDecoderModel`` is assembled from the separate encoder and
    decoder checkpoints and handed to the parent constructor together with its
    config; otherwise the parent constructor receives ``model=None`` and
    ``config=None``. Afterwards git info and a pickle of the hyper-parameters
    are saved and dataset, target-length, decoder-start-token and evaluation
    settings are derived.

    Raises:
        NotImplementedError: if ``max_tokens_per_batch`` is set with more than
            one GPU.
        ValueError: if ``sortish_sampler`` and ``max_tokens_per_batch`` are
            both set.
        AssertionError: if an encoder-decoder model is requested without both
            checkpoint names, or the target lengths are inconsistent.
    """
    if hparams.sortish_sampler and hparams.gpus > 1:
        hparams.replace_sampler_ddp = False
    elif hparams.max_tokens_per_batch is not None:
        if hparams.gpus > 1:
            raise NotImplementedError(
                "Dynamic Batch size does not work for multi-gpu training")
        if hparams.sortish_sampler:
            raise ValueError(
                "--sortish_sampler and --max_tokens_per_batch may not be used simultaneously"
            )

    config = None
    model = None
    if hparams.model_name_or_path == "encoder-decoder":
        assert hparams.encoder_model_name_or_path is not None, "Encoder model path/name is None"
        # The message used to say "Encoder" here as well (copy-paste slip).
        assert hparams.decoder_model_name_or_path is not None, "Decoder model path/name is None"
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            hparams.encoder_model_name_or_path,
            hparams.decoder_model_name_or_path)
        config = model.config

    super().__init__(hparams,
                     num_labels=None,
                     mode=self.mode,
                     model=model,
                     config=config,
                     **kwargs)
    use_task_specific_params(self.model, "summarization")
    save_git_info(self.hparams.output_dir)
    self.metrics_save_path = Path(self.output_dir) / "metrics.json"
    self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
    pickle_save(self.hparams, self.hparams_save_path)
    self.step_count = 0
    self.metrics = defaultdict(list)

    # The vocabulary size lives in a different config field per model type.
    self.model_type = self.config.model_type
    if self.model_type == "fsmt":
        self.vocab_size = self.config.tgt_vocab_size
    elif self.model_type == "encoder_decoder":
        self.vocab_size = self.config.decoder.vocab_size
    else:
        self.vocab_size = self.config.vocab_size

    self.dataset_kwargs: dict = dict(
        data_dir=self.hparams.data_dir,
        max_source_length=self.hparams.max_source_length,
        # prefix=self.model.config.prefix or "",
        prefix="",
    )
    n_observations_per_split = {
        "train": self.hparams.n_train,
        "val": self.hparams.n_val,
        "test": self.hparams.n_test,
    }
    # A negative count means "no limit" and is stored as None.
    self.n_obs = {
        k: v if v >= 0 else None
        for k, v in n_observations_per_split.items()
    }

    self.target_lens = {
        "train": self.hparams.max_target_length,
        "val": self.hparams.val_max_target_length,
        "test": self.hparams.test_max_target_length,
    }
    assert self.target_lens["train"] <= self.target_lens[
        "val"], f"target_lens: {self.target_lens}"
    assert self.target_lens["train"] <= self.target_lens[
        "test"], f"target_lens: {self.target_lens}"

    if self.hparams.freeze_embeds:
        freeze_embeds(self.model)
    if self.hparams.freeze_encoder:
        freeze_params(self.model.get_encoder())
        assert_all_frozen(self.model.get_encoder())

    self.hparams.git_sha = get_git_info()["repo_sha"]
    self.num_workers = hparams.num_workers

    self.decoder_start_token_id = None  # default to config
    if self.model.config.decoder_start_token_id is None and isinstance(
            self.tokenizer, MBartTokenizer):
        self.decoder_start_token_id = self.tokenizer.lang_code_to_id[
            hparams.tgt_lang]
        self.model.config.decoder_start_token_id = self.decoder_start_token_id
    if isinstance(self.config, EncoderDecoderConfig):
        self.decoder_start_token_id = self.config.decoder.pad_token_id

    # Use Seq2SeqDataset only when the tokenizer's own class defines
    # `prepare_seq2seq_batch` (the qualname starts with that class name).
    # A tokenizer without the method falls back to the legacy dataset
    # instead of raising AttributeError.
    prepare_fn = getattr(self.tokenizer, "prepare_seq2seq_batch", None)
    defining_class = getattr(prepare_fn, "__qualname__", "").partition(".")[0]
    if prepare_fn is not None and defining_class == self.tokenizer.__class__.__name__:
        self.dataset_class = Seq2SeqDataset
    else:
        self.dataset_class = LegacySeq2SeqDataset

    self.already_saved_batch = False
    self.eval_beams = self.model.config.num_beams if self.hparams.eval_beams is None else self.hparams.eval_beams
    if self.hparams.eval_max_gen_length is not None:
        self.eval_max_length = self.hparams.eval_max_gen_length
    else:
        self.eval_max_length = self.model.config.max_length
    self.val_metric = self.default_val_metric if self.hparams.val_metric is None else self.hparams.val_metric
def generate_summaries_or_translations(
    examples: List[str],
    out_file: str,
    model_name: str,
    batch_size: int = 8,
    device: str = DEFAULT_DEVICE,
    fp16=False,
    task="summarization",
    prefix=None,
    **generate_kwargs,
) -> Dict:
    """Save model.generate results to <out_file>, and return how long it took.

    The model and tokenizer are loaded from ``model_name``, a fixed set of
    additional special tokens is registered on the tokenizer, and every example
    (prefixed with ``prefix``) is generated in chunks of ``batch_size``. One
    decoded hypothesis is written per line. Before the main loop the first
    example is generated once and printed as a preview.

    Args:
        examples: Source texts to generate from.
        out_file: Path of the text file that receives the hypotheses.
        model_name: Name or path passed to ``from_pretrained``.
        batch_size: Number of examples per generation batch.
        device: Device the model and the batches are moved to.
        fp16: Convert the model to half precision when true.
        task: Key used by ``use_task_specific_params`` to update the config.
        prefix: Text prepended to every example; ``None`` uses the model
            config's ``prefix`` (or an empty string).
        **generate_kwargs: Extra keyword arguments for ``model.generate``;
            they override the built-in generation defaults.

    Returns:
        A dict with ``n_obs``, ``runtime`` (whole seconds) and
        ``seconds_per_sample`` (``0.0`` when ``examples`` is empty).
    """
    model_name = str(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
    if fp16:
        model = model.half()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # if this is wrong, check config.model_type.
    logger.info(f"Inferred tokenizer type: {tokenizer.__class__}")

    tokenizer.add_special_tokens({
        'additional_special_tokens': [
            '<|HOME|>',
            '<|AWAY|>',
            '<|PLAYER-START_POSITION|>',
            '<|PLAYER-MIN|>',
            '<|PLAYER-PTS|>',
            '<|PLAYER-FGM|>',
            '<|PLAYER-FGA|>',
            '<|PLAYER-FG_PCT|>',
            '<|PLAYER-FG3M|>',
            '<|PLAYER-FG3A|>',
            '<|PLAYER-FG3_PCT|>',
            '<|PLAYER-FTM|>',
            '<|PLAYER-FTA|>',
            '<|PLAYER-FT_PCT|>',
            '<|PLAYER-OREB|>',
            '<|PLAYER-DREB|>',
            '<|PLAYER-REB|>',
            '<|PLAYER-AST|>',
            '<|PLAYER-TO|>',
            '<|PLAYER-STL|>',
            '<|PLAYER-BLK|>',
            '<|PLAYER-PF|>',
            '<|TEAM-PTS_QTR1|>',
            '<|TEAM-PTS_QTR2|>',
            '<|TEAM-PTS_QTR3|>',
            '<|TEAM-PTS_QTR4|>',
            '<|TEAM-PTS|>',
            '<|TEAM-FG_PCT|>',
            '<|TEAM-FG3_PCT|>',
            '<|TEAM-FT_PCT|>',
            '<|TEAM-REB|>',
            '<|TEAM-AST|>',
            '<|TEAM-TOV|>',
            '<|TEAM-WINS|>',
            '<|TEAM-LOSSES|>',
            '<|TEAM-CITY|>',
            '<|TEAM-NAME|>',
        ]
    })
    # model.resize_token_embeddings(len(tokenizer))

    max_length = 600
    min_length = 275

    # update config with task specific params
    use_task_specific_params(model, task)

    # Generation defaults; anything in generate_kwargs overrides them, so a
    # caller-supplied num_beams/max_length no longer raises a duplicate
    # keyword TypeError.
    preview_params = dict(
        length_penalty=2.0,
        # +2 from original because we start at step=1 and stop before max_length
        max_length=max_length + 2,
        # +1 from original because we start at step=1
        min_length=min_length + 1,
        no_repeat_ngram_size=3,
        decoder_start_token_id=model.config.eos_token_id,
    )
    # The preview does not set a default num_beams; the main loop uses 4.
    batch_params = dict(preview_params, num_beams=4)
    preview_params.update(generate_kwargs)
    batch_params.update(generate_kwargs)

    def encode(texts):
        # Tokenize to fixed-length (1024) tensors on the target device.
        return tokenizer(texts,
                         return_tensors="pt",
                         max_length=1024,
                         truncation=True,
                         padding="max_length").to(device)

    def decode(token_ids):
        return tokenizer.batch_decode(token_ids,
                                      skip_special_tokens=True,
                                      clean_up_tokenization_spaces=False)

    start_time = time.time()
    if prefix is None:
        prefix = getattr(model.config, "prefix", "") or ""

    # Preview: generate and print the first example (skipped for empty input,
    # which previously raised IndexError).
    if examples:
        batch = encode(prefix + examples[0])
        summaries = model.generate(
            input_ids=batch.input_ids,
            attention_mask=batch.attention_mask,
            **preview_params,
        )
        print(summaries[0])
        print(len(summaries[0]))
        for hyp in decode(summaries):
            print(hyp)

    # The context manager closes the file even if generation fails midway.
    with Path(out_file).open("w", encoding="utf-8") as fout:
        for examples_chunk in tqdm(list(chunks(examples, batch_size))):
            examples_chunk = [prefix + text for text in examples_chunk]
            batch = encode(examples_chunk)
            summaries = model.generate(
                input_ids=batch.input_ids,
                attention_mask=batch.attention_mask,
                **batch_params,
            )
            for hypothesis in decode(summaries):
                fout.write(hypothesis + "\n")
            fout.flush()

    runtime = int(time.time() - start_time)  # seconds
    n_obs = len(examples)
    seconds_per_sample = round(runtime / n_obs, 4) if n_obs else 0.0
    return dict(n_obs=n_obs,
                runtime=runtime,
                seconds_per_sample=seconds_per_sample)
def __init__(self, hparams, **kwargs):
    """Validate sampler options, initialise the base module and derive run settings.

    Reads sampler/batching flags from ``hparams`` before delegating to the
    parent constructor, pickles the hyper-parameters next to the metrics
    file, records per-split observation limits and target lengths, applies
    the optional freezing flags, resizes the token embeddings to the
    tokenizer length and finally prints the number of trainable parameters.

    Args:
        hparams: Namespace-like object holding the attributes read below.
        **kwargs: Forwarded unchanged to the parent constructor.

    Raises:
        NotImplementedError: ``max_tokens_per_batch`` is set with more than one GPU.
        ValueError: ``max_tokens_per_batch`` is combined with ``sortish_sampler``.
    """
    print(hparams)
    if hparams.sortish_sampler and hparams.gpus > 1:
        hparams.replace_sampler_ddp = False
    elif hparams.max_tokens_per_batch is not None:
        if hparams.gpus > 1:
            raise NotImplementedError(
                "Dynamic Batch size does not work for multi-gpu training")
        if hparams.sortish_sampler:
            raise ValueError(
                "--sortish_sampler and --max_tokens_per_batch may not be used simultaneously"
            )

    super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
    use_task_specific_params(self.model, "summarization")
    # save_git_info(self.hparams.output_dir)

    # Output locations and bookkeeping.
    self.metrics_save_path = Path(self.output_dir).joinpath("metrics.json")
    self.hparams_save_path = Path(self.output_dir).joinpath("hparams.pkl")
    pickle_save(self.hparams, self.hparams_save_path)
    self.step_count = 0
    self.metrics = defaultdict(list)

    self.model_type = self.config.model_type
    if self.model_type == "fsmt":
        self.vocab_size = self.config.tgt_vocab_size
    else:
        self.vocab_size = self.config.vocab_size

    self.dataset_kwargs: dict = {
        "data_dir": self.hparams.data_dir,
        "max_source_length": self.hparams.max_source_length,
        "prefix": self.model.config.prefix or "",
    }

    # A negative requested count is stored as None.
    requested_obs = {
        self.hparams.train_name: self.hparams.n_train,
        "val": self.hparams.n_val,
        "test": self.hparams.n_test,
    }
    obs_limits = {}
    for split, limit in requested_obs.items():
        obs_limits[split] = limit if limit >= 0 else None
    self.n_obs = obs_limits

    self.target_lens = {
        self.hparams.train_name: self.hparams.max_target_length,
        "val": self.hparams.val_max_target_length,
        "test": self.hparams.test_max_target_length,
    }
    # The training target length may not exceed either evaluation length.
    for split in ("val", "test"):
        assert self.target_lens[self.hparams.train_name] <= self.target_lens[split], \
            f"target_lens: {self.target_lens}"

    if self.hparams.freeze_embeds:
        freeze_embeds(self.model)
    if self.hparams.freeze_encoder:
        encoder = self.model.get_encoder()
        freeze_params(encoder)
        assert_all_frozen(self.model.get_encoder())

    # self.tokenizer.add_special_tokens({"additional_special_tokens": ["<%s>"%i for i in range(100)]})
    self.model.resize_token_embeddings(len(self.tokenizer))
    self.vocab_size = len(self.tokenizer)
    print("vocab_size:", self.vocab_size)

    # self.hparams.git_sha = get_git_info()["repo_sha"]
    self.num_workers = hparams.num_workers

    self.decoder_start_token_id = None  # default to config
    needs_lang_start_token = (
        self.model.config.decoder_start_token_id is None
        and isinstance(self.tokenizer, MBartTokenizer)
    )
    if needs_lang_start_token:
        self.decoder_start_token_id = self.tokenizer.lang_code_to_id[hparams.tgt_lang]
        self.model.config.decoder_start_token_id = self.decoder_start_token_id

    if hasattr(self.tokenizer, "prepare_seq2seq_batch"):
        self.dataset_class = Seq2SeqDataset
    else:
        self.dataset_class = LegacySeq2SeqDataset
    self.already_saved_batch = False

    # Evaluation settings: an explicit hparam wins, otherwise use the default.
    if self.hparams.eval_beams is None:
        self.eval_beams = self.model.config.num_beams
    else:
        self.eval_beams = self.hparams.eval_beams

    if self.hparams.eval_max_gen_length is not None:
        self.eval_max_length = self.hparams.eval_max_gen_length
    else:
        self.eval_max_length = self.model.config.max_length

    if self.hparams.val_metric is None:
        self.val_metric = self.default_val_metric
    else:
        self.val_metric = self.hparams.val_metric

    # Count only parameters that still require gradients.
    num_param = sum(
        torch.numel(param)
        for _name, param in self.model.named_parameters()
        if param.requires_grad
    )
    separator = "=" * 10
    print(separator)
    print("# Parameters:", num_param)
    print(separator)
def __init__(self, hparams, **kwargs):
    """Initialise the module with the entire seq2seq model frozen.

    Validates the sampler/batching flags on ``hparams``, delegates to the
    parent constructor, saves git info and a pickle of the hyper-parameters,
    records per-split observation limits and target lengths, freezes
    ``self.seq2seq_model`` unconditionally and sets the decoding settings.

    Args:
        hparams: Namespace-like object holding the attributes read below.
        **kwargs: Forwarded unchanged to the parent constructor.

    Raises:
        NotImplementedError: ``max_tokens_per_batch`` is set with more than one GPU.
        ValueError: ``max_tokens_per_batch`` is combined with ``sortish_sampler``.
    """
    if hparams.sortish_sampler and hparams.gpus > 1:
        hparams.replace_sampler_ddp = False
    elif hparams.max_tokens_per_batch is not None:
        if hparams.gpus > 1:
            raise NotImplementedError("Dynamic Batch size does not work for multi-gpu training")
        if hparams.sortish_sampler:
            raise ValueError("--sortish_sampler and --max_tokens_per_batch may not be used simultaneously")

    super().__init__(hparams, num_labels=None, mode=self.mode, **kwargs)
    use_task_specific_params(self.model, "summarization")
    save_git_info(self.hparams.output_dir)
    self.metrics_save_path = Path(self.output_dir) / "metrics.json"
    self.hparams_save_path = Path(self.output_dir) / "hparams.pkl"
    pickle_save(self.hparams, self.hparams_save_path)
    self.step_count = 0
    self.metrics = defaultdict(list)
    self.model_type = self.config.model_type
    self.vocab_size = self.config.tgt_vocab_size if self.model_type == "fsmt" else self.config.vocab_size

    self.dataset_kwargs: dict = dict(
        data_dir=self.hparams.data_dir,
        max_source_length=self.hparams.max_source_length,
        prefix=self.model.config.prefix or "",
    )
    n_observations_per_split = {
        "train": self.hparams.n_train,
        "val": self.hparams.n_val,
        "test": self.hparams.n_test,
    }
    # A negative requested count is stored as None.
    self.n_obs = {k: v if v >= 0 else None for k, v in n_observations_per_split.items()}

    self.target_lens = {
        "train": self.hparams.max_target_length,
        "val": self.hparams.val_max_target_length,
        "test": self.hparams.test_max_target_length,
    }
    assert self.target_lens["train"] <= self.target_lens["val"], f"target_lens: {self.target_lens}"
    assert self.target_lens["train"] <= self.target_lens["test"], f"target_lens: {self.target_lens}"

    # The freeze_embeds / freeze_encoder hparams are not consulted in this
    # variant: the whole seq2seq model is frozen regardless of them.
    freeze_params(self.seq2seq_model)
    assert_all_frozen(self.seq2seq_model)
    print('FREEZING ENTIRE seq2seq model.')

    self.hparams.git_sha = get_git_info()["repo_sha"]
    self.num_workers = hparams.num_workers
    self.decoder_start_token_id = None  # default to config
    if self.model.config.decoder_start_token_id is None and isinstance(self.tokenizer, MBartTokenizer):
        self.decoder_start_token_id = self.tokenizer.lang_code_to_id[hparams.tgt_lang]
        self.model.config.decoder_start_token_id = self.decoder_start_token_id
    self.dataset_class = (
        Seq2SeqDataset if hasattr(self.tokenizer, "prepare_seq2seq_batch") else LegacySeq2SeqDataset
    )

    self.eval_beams = self.model.config.num_beams if self.hparams.eval_beams is None else self.hparams.eval_beams
    # The message now matches the condition actually checked (>= 1).
    assert self.eval_beams >= 1, f"got self.eval_beams={self.eval_beams}. Need an integer >= 1"
    if self.hparams.eval_max_gen_length is not None:
        self.eval_max_length = self.hparams.eval_max_gen_length
    else:
        self.eval_max_length = self.model.config.max_length
    self.val_metric = self.default_val_metric if self.hparams.val_metric is None else self.hparams.val_metric

    self.training_acc_across_batches_at_curr_epoch = []

    # NOTE: these hard-coded decoding settings replace the hparams/config
    # derived eval_max_length and eval_beams computed above.
    self.eval_max_length = 62
    self.eval_min_length = 11
    self.eval_beams = 6
    print('for deocding, eval_max_length={}, '
          'eval_min_length={}, eval_beams={}'.format(self.eval_max_length, self.eval_min_length, self.eval_beams))
def main():
    """Parse arguments, build model/tokenizer/datasets, then train, evaluate and predict.

    Arguments come either from a single ``.json`` file given as the only
    command-line argument or from the regular command line. Depending on the
    ``do_train`` / ``do_eval`` / ``do_predict`` flags the function trains and
    saves the model, writes ``eval_results.json``, and writes
    ``test_results.json`` plus (with ``predict_with_generate``)
    ``test_generations.txt`` into ``training_args.output_dir``.

    Returns:
        The evaluation metrics dict. It is filled only on the world-zero
        process when ``do_eval`` is set and is empty otherwise.

    Raises:
        ValueError: The output directory exists and is non-empty while
            training is requested without ``overwrite_output_dir``.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_args.model_name_or_path,
        from_tf=".ckpt" in model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # use task specific params
    use_task_specific_params(model, data_args.task)

    # set num_beams for evaluation
    if data_args.eval_beams is not None:
        model.config.num_beams = data_args.eval_beams
    assert model.config.num_beams >= 1, f"got eval_beams={model.config.num_beams}. Need an integer >= 1"

    # set max length for generation
    model.config.max_generate_length = data_args.val_max_target_length

    # set decoder_start_token_id for MBart
    if model.config.decoder_start_token_id is None and isinstance(tokenizer, MBartTokenizer):
        decoder_start_token_id = tokenizer.lang_code_to_id[data_args.tgt_lang]
        model.config.decoder_start_token_id = decoder_start_token_id

    def build_compute_metrics_fn(task_name: str) -> Callable[[EvalPrediction], Dict]:
        """Return the ROUGE-based metric function for summarization tasks, else the BLEU-based one."""

        def non_pad_len(tokens: np.ndarray) -> int:
            return np.count_nonzero(tokens != tokenizer.pad_token_id)

        def decode_pred(pred: EvalPrediction) -> Tuple[List[str], List[str]]:
            pred_str = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
            label_str = tokenizer.batch_decode(pred.label_ids, skip_special_tokens=True)
            pred_str = lmap(str.strip, pred_str)
            label_str = lmap(str.strip, label_str)
            return pred_str, label_str

        def summarization_metrics(pred: EvalPrediction) -> Dict:
            pred_str, label_str = decode_pred(pred)
            rouge: Dict = calculate_rouge(pred_str, label_str)
            summ_len = np.mean(lmap(non_pad_len, pred.predictions))
            rouge.update({"gen_len": summ_len})
            return rouge

        def translation_metrics(pred: EvalPrediction) -> Dict:
            pred_str, label_str = decode_pred(pred)
            bleu: Dict = calculate_bleu(pred_str, label_str)
            gen_len = np.mean(lmap(non_pad_len, pred.predictions))
            bleu.update({"gen_len": gen_len})
            return bleu

        compute_metrics_fn = summarization_metrics if "summarization" in task_name else translation_metrics
        return compute_metrics_fn

    def freeze_embeds(model: torch.nn.Module):
        """Freeze token embeddings and positional embeddings for bart, just token embeddings for t5."""
        try:
            freeze_params(model.model.shared)
            for d in [model.model.encoder, model.model.decoder]:
                freeze_params(d.embed_positions)
                freeze_params(d.embed_tokens)
        except AttributeError:
            freeze_params(model.shared)
            for d in [model.encoder, model.decoder]:
                freeze_params(d.embed_tokens)

    if model_args.freeze_embeds:
        freeze_embeds(model)
    if model_args.freeze_encoder:
        freeze_params(model.get_encoder())
        assert_all_frozen(model.get_encoder())

    dataset_class = Seq2SeqDataset if hasattr(tokenizer, "prepare_seq2seq_batch") else LegacySeq2SeqDataset

    def load_split(type_path, n_obs, max_target_length):
        """Build the dataset for one split; only the three arguments differ between splits."""
        return dataset_class(
            tokenizer,
            type_path=type_path,
            data_dir=data_args.data_dir,
            n_obs=n_obs,
            max_target_length=max_target_length,
            max_source_length=data_args.max_source_length,
            prefix=model.config.prefix or "",
        )

    # Get datasets
    train_dataset = (
        load_split("train", data_args.n_train, data_args.max_target_length)
        if training_args.do_train
        else None
    )
    eval_dataset = (
        load_split("val", data_args.n_val, data_args.val_max_target_length)
        if training_args.do_eval or training_args.evaluation_strategy != EvaluationStrategy.NO
        else None
    )
    test_dataset = (
        load_split("test", data_args.n_test, data_args.test_max_target_length)
        if training_args.do_predict
        else None
    )

    # Initialize our Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=Seq2SeqDataCollator(tokenizer, data_args, training_args.tpu_num_cores),
        compute_metrics=build_compute_metrics_fn(data_args.task) if training_args.predict_with_generate else None,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.json")
        if trainer.is_world_process_zero():
            logger.info("***** Eval results *****")
            for key, value in result.items():
                logger.info("  %s = %s", key, value)

            with open(output_eval_file, "w", encoding="utf-8") as f:
                json.dump(result, f)

            eval_results.update(result)

    if training_args.do_predict:
        # Use the module logger like every other message in this function.
        logger.info("*** Test ***")

        test_output = trainer.predict(test_dataset=test_dataset)
        test_metrics = {k.replace("eval", "test"): v for k, v in test_output.metrics.items()}

        output_test_file = os.path.join(training_args.output_dir, "test_results.json")

        if trainer.is_world_process_zero():
            logger.info("***** Test results *****")
            for key, value in test_metrics.items():
                logger.info("  %s = %s", key, value)

            with open(output_test_file, "w", encoding="utf-8") as f:
                json.dump(test_metrics, f)

            if training_args.predict_with_generate:
                test_preds = tokenizer.batch_decode(test_output.predictions, skip_special_tokens=True)
                test_preds = lmap(str.strip, test_preds)
                output_test_pred_file = os.path.join(training_args.output_dir, "test_generations.txt")
                # Explicit UTF-8 so non-ASCII generations do not depend on the
                # platform's default encoding.
                with open(output_test_pred_file, "w", encoding="utf-8") as f:
                    f.write("\n".join(test_preds))

    return eval_results