def test_clm(self, stage):
    """Run ``run_clm.py`` with the DeepSpeed config for ``stage``.

    ``run_clm.py`` exercises ``model.resize_token_embeddings()``, which
    requires param gathering outside of ``forward``. ``run_translation.py``
    does not use it, hence this separate test.

    Args:
        stage: suffix selecting the ``ds_config_{stage}.json`` config file.
    """
    fixtures_dir = self.tests_dir / "fixtures"
    tmp_output_dir = self.get_auto_remove_tmp_dir()

    script_args = f"""
        --model_name_or_path {GPT2_TINY}
        --train_file {fixtures_dir}/sample_text.txt
        --validation_file {fixtures_dir}/sample_text.txt
        --output_dir {tmp_output_dir}
        --overwrite_output_dir
        --do_train
        --do_eval
        --max_train_samples 16
        --max_eval_samples 16
        --per_device_train_batch_size 2
        --per_device_eval_batch_size 2
        --num_train_epochs 1
        --warmup_steps 8
        --block_size 64
        --fp16
        --report_to none
        """.split()
    deepspeed_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()
    script_path = f"{self.examples_dir_str}/pytorch/language-modeling/run_clm.py"

    # Full command line: launcher, then the script, then its arguments.
    cmd = [*get_launcher(distributed=True), script_path, *script_args, *deepspeed_args]
    # keep for quick debug
    # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
    execute_subprocess_async(cmd, env=self.get_env())
def test_clm_from_config_zero3(self):
    """Train ``run_clm.py`` from a bare ``--model_type`` with the ZeRO-3 config.

    This exercises ``AutoModel.from_config(config)`` to ensure ``zero.Init``
    is called. The check is that the captured stderr of the run contains
    "Detected DeepSpeed ZeRO-3".
    """
    fixtures_dir = self.tests_dir / "fixtures"
    tmp_output_dir = self.get_auto_remove_tmp_dir()

    script_args = f"""
        --model_type gpt2
        --tokenizer_name sshleifer/tiny-gpt2
        --train_file {fixtures_dir}/sample_text.txt
        --validation_file {fixtures_dir}/sample_text.txt
        --output_dir {tmp_output_dir}
        --overwrite_output_dir
        --do_train
        --max_train_samples 4
        --per_device_train_batch_size 2
        --num_train_epochs 1
        --warmup_steps 8
        --block_size 8
        --fp16
        --report_to none
        """.split()
    deepspeed_args = f"--deepspeed {self.test_file_dir_str}/ds_config_zero3.json".split()
    script_path = f"{self.examples_dir_str}/pytorch/language-modeling/run_clm.py"

    cmd = [*self.get_launcher(distributed=True), script_path, *script_args, *deepspeed_args]
    # keep for quick debug
    # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
    with CaptureStderr() as captured:
        execute_subprocess_async(cmd, env=self.get_env())
    assert "Detected DeepSpeed ZeRO-3" in captured.err
def run_trainer(
    self,
    eval_steps: int,
    max_len: str,
    model_name: str,
    num_train_epochs: int,
    distributed: bool = True,
    extra_args_str: str = None,
    remove_args_str: str = None,
):
    """Run ``seq2seq/run_seq2seq.py`` through the ``deepspeed`` launcher.

    Args:
        eval_steps: value passed as ``--save_steps``.
        max_len: value used for the source/target/val max length flags.
        model_name: value passed as ``--model_name_or_path``.
        num_train_epochs: value passed as ``--num_train_epochs``.
        distributed: when true, ``--num_gpus`` is ``get_gpu_count()``;
            otherwise it is 1.
        extra_args_str: optional whitespace-separated arguments to append.
        remove_args_str: optional whitespace-separated tokens; every
            argument equal to one of them is dropped.

    Returns:
        The temporary output directory handed to the script.
    """
    wmt_dir = self.examples_dir / "test_data/wmt_en_ro"
    output_dir = self.get_auto_remove_tmp_dir()

    script_args = f"""
        --model_name_or_path {model_name}
        --train_file {wmt_dir}/train.json
        --validation_file {wmt_dir}/val.json
        --output_dir {output_dir}
        --overwrite_output_dir
        --max_train_samples 8
        --max_val_samples 8
        --max_source_length {max_len}
        --max_target_length {max_len}
        --val_max_target_length {max_len}
        --do_train
        --num_train_epochs {num_train_epochs}
        --per_device_train_batch_size 4
        --learning_rate 3e-3
        --warmup_steps 8
        --predict_with_generate
        --logging_steps 0
        --save_steps {eval_steps}
        --group_by_length
        --label_smoothing_factor 0.1
        --adafactor
        --task translation
        --target_lang ro_RO
        --source_lang en_XX
        """.split()

    if extra_args_str is not None:
        script_args += extra_args_str.split()

    if remove_args_str is not None:
        # Token-level filter: only exact matches are removed.
        unwanted = set(remove_args_str.split())
        script_args = [token for token in script_args if token not in unwanted]

    deepspeed_args = f"--deepspeed {self.test_file_dir_str}/ds_config.json".split()
    script_path = f"{self.examples_dir_str}/seq2seq/run_seq2seq.py"
    gpu_count = get_gpu_count() if distributed else 1

    cmd = ["deepspeed", "--num_gpus", str(gpu_count), script_path, *script_args, *deepspeed_args]
    # keep for quick debug
    # print(" ".join([f"PYTHONPATH={self.src_dir_str}"] +cmd)); die
    execute_subprocess_async(cmd, env=self.get_env())

    return output_dir
def _run_finetune(self, gpus: int, distributed_retriever: str = "pytorch"):
    """Run ``finetune_rag`` in a subprocess on dummy data and return its metrics.

    Args:
        gpus: number of GPUs to request. Values above 0 add ``--gpus={gpus}``
            (plus ``--fp16`` when apex is available); otherwise the run is
            configured with ``--gpus=0``, the ``ddp_cpu`` backend and 2
            processes.
        distributed_retriever: value passed as ``--distributed_retriever``.

    Returns:
        The parsed content of ``metrics.json`` from the output directory.
    """
    # Also send this module's log records to stdout.
    logger.addHandler(logging.StreamHandler(sys.stdout))

    work_dir = self.get_auto_remove_tmp_dir()
    output_dir = os.path.join(work_dir, "output")
    data_dir = os.path.join(work_dir, "data")
    self._create_dummy_data(data_dir=data_dir)

    testargs = f"""
            --data_dir {data_dir} \
            --output_dir {output_dir} \
            --model_name_or_path facebook/rag-sequence-base \
            --model_type rag_sequence \
            --do_train \
            --do_predict \
            --n_val -1 \
            --val_check_interval 1.0 \
            --train_batch_size 2 \
            --eval_batch_size 1 \
            --max_source_length 25 \
            --max_target_length 25 \
            --val_max_target_length 25 \
            --test_max_target_length 25 \
            --label_smoothing 0.1 \
            --dropout 0.1 \
            --attention_dropout 0.1 \
            --weight_decay 0.001 \
            --adam_epsilon 1e-08 \
            --max_grad_norm 0.1 \
            --lr_scheduler polynomial \
            --learning_rate 3e-04 \
            --num_train_epochs 1 \
            --warmup_steps 4 \
            --gradient_accumulation_steps 1 \
            --distributed-port 8787 \
            --use_dummy_dataset 1 \
            --distributed_retriever {distributed_retriever} \
        """.split()

    if gpus > 0:
        testargs.append(f"--gpus={gpus}")
        if is_apex_available():
            testargs.append("--fp16")
    else:
        testargs.extend(["--gpus=0", "--distributed_backend=ddp_cpu", "--num_processes=2"])

    script_path = str(Path(finetune_rag.__file__).resolve())
    cmd = [sys.executable, script_path, *testargs]
    execute_subprocess_async(cmd, env=self.get_env())

    with open(os.path.join(output_dir, "metrics.json")) as metrics_file:
        return json.load(metrics_file)
def run_trainer(self, eval_steps: int, max_len: str, model_name: str, num_train_epochs: int, distributed: bool = False):
    """Run ``finetune_trainer.py`` on the wmt_en_ro test data.

    With ``distributed`` set, the script is launched in a subprocess via
    ``torch.distributed.launch`` with one process per GPU reported by
    ``get_gpu_count()``. Otherwise ``main()`` is called in-process with a
    patched ``sys.argv``.

    Args:
        eval_steps: value used for both ``--save_steps`` and ``--eval_steps``.
        max_len: value used for the source/target/val max length flags.
        model_name: value passed as ``--model_name_or_path``.
        num_train_epochs: value passed as ``--num_train_epochs``.
        distributed: selects the subprocess launch over the in-process run.

    Returns:
        The temporary output directory handed to the script.
    """
    wmt_dir = self.examples_dir / "seq2seq/test_data/wmt_en_ro"
    output_dir = self.get_auto_remove_tmp_dir()

    script_args = f"""
        --model_name_or_path {model_name}
        --data_dir {wmt_dir}
        --output_dir {output_dir}
        --overwrite_output_dir
        --n_train 8
        --n_val 8
        --max_source_length {max_len}
        --max_target_length {max_len}
        --val_max_target_length {max_len}
        --do_train
        --do_eval
        --do_predict
        --num_train_epochs {num_train_epochs}
        --per_device_train_batch_size 4
        --per_device_eval_batch_size 4
        --learning_rate 3e-3
        --warmup_steps 8
        --evaluation_strategy steps
        --predict_with_generate
        --logging_steps 0
        --save_steps {eval_steps}
        --eval_steps {eval_steps}
        --sortish_sampler
        --label_smoothing 0.1
        --adafactor
        --task translation
        --tgt_lang ro_RO
        --src_lang en_XX
        """.split()
    # --eval_beams 2

    if not distributed:
        # In-process run: main() reads its arguments from sys.argv.
        with patch.object(sys, "argv", ["finetune_trainer.py", *script_args]):
            main()
        return output_dir

    gpu_count = get_gpu_count()
    launch_args = f"""
        -m torch.distributed.launch
        --nproc_per_node={gpu_count}
        {self.test_file_dir}/finetune_trainer.py
        """.split()
    execute_subprocess_async([sys.executable, *launch_args, *script_args], env=self.get_env())
    return output_dir
def _run_finetune(self, gpus: int, distributed_retriever: str = "pytorch"):
    """Run ``finetune_rag`` in a subprocess with hard-coded data and index paths.

    Dummy data creation is disabled here; the command line points at fixed
    ``/dccstor/...`` locations for the data, the index and the passages.

    Args:
        gpus: number of GPUs to request. Values above 0 add ``--gpus={gpus}``
            (plus ``--fp16`` when apex is available); otherwise the run uses
            ``--gpus=0`` and ``--num_processes=1``.
        distributed_retriever: value passed as ``--distributed_retriever``.
    """
    # Also send this module's log records to stdout.
    logger.addHandler(logging.StreamHandler(sys.stdout))

    work_dir = self.get_auto_remove_tmp_dir()
    # NOTE(review): neither of these two locals reaches the command line below,
    # which passes a relative "output" dir and repeats the data path literally.
    output_dir = os.path.join(work_dir, "output")
    data_dir = "/dccstor/dialog/sfeng/projects/transformers_dialdoc/data_v2/dd_generation_structure_two"

    testargs = f"""
            --data_dir /dccstor/dialog/sfeng/projects/transformers_dialdoc/data_v2/dd_generation_structure_two \
            --output_dir output \
            --index_name custom \
            --index_path /dccstor/dialog/sfeng/projects/transformers_dialdoc/data_v2/dd_knowledge_dataset-token-dpr_new/my_knowledge_dataset_hnsw_index.faiss \
            --passages_path /dccstor/dialog/sfeng/projects/transformers_dialdoc/data_v2/dd_knowledge_dataset-token-dpr_new/my_knowledge_dataset \
            --model_name_or_path facebook/rag-token-base \
            --model_type rag_token \
            --do_train \
            --do_predict \
            --n_val -1 \
            --val_check_interval 1.0 \
            --train_batch_size 4 \
            --eval_batch_size 8 \
            --max_source_length 25 \
            --max_target_length 25 \
            --val_max_target_length 25 \
            --test_max_target_length 25 \
            --label_smoothing 0.1 \
            --dropout 0.1 \
            --attention_dropout 0.1 \
            --weight_decay 0.001 \
            --adam_epsilon 1e-08 \
            --max_grad_norm 0.1 \
            --lr_scheduler polynomial \
            --learning_rate 3e-04 \
            --num_train_epochs 1 \
            --warmup_steps 4 \
            --gradient_accumulation_steps 1 \
            --distributed-port 8787 \
            --use_dummy_dataset 1 \
            --distributed_retriever {distributed_retriever} \
        """.split()

    if gpus > 0:
        testargs.append(f"--gpus={gpus}")
        if is_apex_available():
            testargs.append("--fp16")
    else:
        # The ddp_cpu distributed backend flag is intentionally not passed here.
        testargs.extend(["--gpus=0", "--num_processes=1"])

    script_path = str(Path(finetune_rag.__file__).resolve())
    cmd = [sys.executable, script_path, *testargs]
    execute_subprocess_async(cmd, env=self.get_env())
def test_trainer(self):
    """Launch ``test_trainer_distributed.py`` via ``torch.distributed.launch``.

    One process is started per CUDA device reported by
    ``torch.cuda.device_count()``, and the script receives a temporary
    ``--output_dir``.
    """
    launch_args = f"""
        -m torch.distributed.launch
        --nproc_per_node={torch.cuda.device_count()}
        {self.test_file_dir}/test_trainer_distributed.py
        """.split()
    tmp_output_dir = self.get_auto_remove_tmp_dir()
    script_args = f"--output_dir {tmp_output_dir}".split()

    cmd = [sys.executable, *launch_args, *script_args]
    execute_subprocess_async(cmd, env=self.get_env())
def _test_distiller_cli_fork(self, updates, check_contents=True):
    """Run ``distillation.py`` in a subprocess and validate its outputs.

    The command line is derived from ``CHEAP_ARGS`` overlaid with a set of
    defaults and then with ``updates``. After the run, the output directory
    must contain at least one ``*ckpt`` file, ``test_generations.txt`` and
    ``test_results.txt``, and ``metrics.json`` must hold the expected number
    of validation and test entries.

    Args:
        updates: overrides applied on top of the built-in defaults.
        check_contents: accepted but not read by this method.
    """
    overrides = {
        "label_smoothing": 0.0,
        "early_stopping_patience": -1,
        "train_batch_size": 1,
        "eval_batch_size": 2,
        "max_epochs": 2,
        "alpha_mlm": 0.2,
        "alpha_ce": 0.8,
        "do_predict": True,
        "model_name_or_path": "sshleifer/tinier_bart",
        "teacher": CHEAP_ARGS["model_name_or_path"],
        "val_check_interval": 0.5,
    }
    overrides.update(updates)

    args_d: dict = CHEAP_ARGS.copy()
    data_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
    output_dir = self.get_auto_remove_tmp_dir()
    args_d.update(data_dir=data_dir, output_dir=output_dir, **overrides)

    # Keys that are never forwarded to the command line.
    skipped_keys = ("tgt_suffix", "server_ip", "server_port", "out", "n_tpu_cores")
    cli_args = []
    for key, value in args_d.items():
        if key in skipped_keys or value is False or value is None:
            continue
        # ``True`` becomes a bare flag; anything else is ``--key=value``.
        cli_args.append(f"--{key}" if value is True else f"--{key}={value}")

    cmd = [sys.executable, f"{self.test_file_dir}/distillation.py", *cli_args]
    execute_subprocess_async(cmd, env=self.get_env())

    contents = {os.path.basename(entry) for entry in os.listdir(output_dir)}
    assert any(name.endswith("ckpt") for name in contents)
    for expected_file in ("test_generations.txt", "test_results.txt"):
        self.assertIn(expected_file, contents)

    # get the following from the module, (we don't have access to `model` here)
    val_metric = "rouge2"
    metrics = load_json(os.path.join(output_dir, "metrics.json"))
    # {'test': [{'test_avg_loss': 10.63731575012207, 'test_avg_rouge1': 0.0, 'test_avg_rouge2': 0.0, 'test_avg_rougeL': 0.0, 'test_avg_gen_time': 0.1822289228439331, 'test_avg_gen_len': 142.0, 'step_count': 1}]}
    print(metrics)

    final_val_stats = metrics["val"][-1]
    self.assertGreaterEqual(final_val_stats["val_avg_gen_time"], 0.01)
    self.assertIsInstance(final_val_stats[f"val_avg_{val_metric}"], float)
    self.assertEqual(len(metrics["test"]), 1)

    desired_n_evals = int(args_d["max_epochs"] * (1 / args_d["val_check_interval"]) / 2 + 1)
    self.assertEqual(len(metrics["val"]), desired_n_evals)
def run_trainer(
    self,
    stage: str,
    model_name: str,
    eval_steps: int = 10,
    num_train_epochs: int = 1,
    distributed: bool = True,
    fp16: bool = True,
):
    """Run the wav2vec2 ``run_asr.py`` script with the DeepSpeed config for ``stage``.

    Args:
        stage: suffix selecting ``ds_config_wav2vec2_{stage}.json``.
        model_name: value passed as ``--model_name_or_path``.
        eval_steps: value passed as ``--eval_steps``.
        num_train_epochs: value passed as ``--num_train_epochs``.
        distributed: forwarded to ``self.get_launcher``.
        fp16: when true, ``--fp16`` is appended.

    Returns:
        The temporary output directory handed to the script.
    """
    # Use a unique auto-removed temp dir. The previous fixed "./xxx" path with
    # after=False was a debugging leftover that wrote into the current working
    # directory and was never cleaned up after the test.
    output_dir = self.get_auto_remove_tmp_dir()
    args = f"""
        --model_name_or_path {model_name}
        --dataset_name patrickvonplaten/librispeech_asr_dummy
        --dataset_config_name clean
        --train_split_name validation
        --validation_split_name validation
        --output_dir {output_dir}
        --num_train_epochs {num_train_epochs}
        --per_device_train_batch_size 2
        --per_device_eval_batch_size 2
        --evaluation_strategy steps
        --learning_rate 5e-4
        --warmup_steps 8
        --orthography timit
        --preprocessing_num_workers 1
        --group_by_length
        --freeze_feature_extractor
        --report_to none
        --logging_steps 0
        --save_steps 0
        --eval_steps {eval_steps}
        """.split()

    if fp16:
        args.append("--fp16")

    # currently ds_config_wav2vec2_zero.json requires "zero_optimization.find_unused_parameters": true,
    # hence the separate config files
    ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_wav2vec2_{stage}.json".split()
    script = [f"{self.examples_dir_str}/research_projects/wav2vec2/run_asr.py"]
    launcher = self.get_launcher(distributed)

    cmd = launcher + script + args + ds_args
    # keep for quick debug
    # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
    execute_subprocess_async(cmd, env=self.get_env())

    return output_dir
def test_load_best_model(self, stage, dtype):
    """Run ``run_translation.py`` with ``--load_best_model_at_end`` under DeepSpeed.

    The key point is being able to resume after some training. The test only
    checks that the captured stdout of the run contains "DeepSpeed info".

    Args:
        stage: suffix selecting the ``ds_config_{stage}.json`` config file.
        dtype: name of the precision flag, passed as ``--{dtype}``.
    """
    wmt_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro"
    tmp_output_dir = self.get_auto_remove_tmp_dir()

    script_args = f"""
        --model_name_or_path {T5_TINY}
        --tokenizer_name {T5_TINY}
        --train_file {wmt_dir}/train.json
        --validation_file {wmt_dir}/val.json
        --output_dir {tmp_output_dir}
        --overwrite_output_dir
        --source_lang en
        --target_lang ro
        --do_train
        --max_train_samples 3
        --do_eval
        --max_eval_samples 1
        --logging_strategy steps
        --logging_steps 1
        --evaluation_strategy steps
        --eval_steps 1
        --save_strategy steps
        --save_steps 1
        --load_best_model_at_end
        --per_device_train_batch_size 1
        --per_device_eval_batch_size 1
        --num_train_epochs 1
        --report_to none
        """.split()
    # The prefix contains spaces, so it is added as one token after splitting.
    script_args += ["--source_prefix", "translate English to Romanian: ", f"--{dtype}"]

    deepspeed_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()
    script_path = f"{self.examples_dir_str}/pytorch/translation/run_translation.py"

    cmd = [*get_launcher(distributed=False), script_path, *script_args, *deepspeed_args]
    # keep for quick debug
    # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
    with CaptureStd() as captured:
        execute_subprocess_async(cmd, env=self.get_env())
    # enough to test it didn't fail
    self.assertIn("DeepSpeed info", captured.out)
def test_zero_to_fp32(self, stage, task):
    """Check that full fp32 weights can be recovered after a DeepSpeed run.

    First the task command from ``self.get_task_cmd`` is run with
    ``--save_steps 1`` to generate a checkpoint. Then the ``zero_to_fp32.py``
    script found in ``checkpoint-1`` is executed and must produce the
    consolidated weights file.

    Args:
        stage: forwarded to ``self.get_task_cmd``.
        task: forwarded to ``self.get_task_cmd``.
    """
    cmd, output_dir = self.get_task_cmd(task, stage)

    # 1. generate the checkpoint
    # Build a new list rather than extending the one returned by the helper.
    cmd = cmd + ["--save_steps", "1"]
    # keep for quick debug
    # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] + cmd)); die
    execute_subprocess_async(cmd, env=self.get_env())

    # 2. test that the fp32 weights get reconsolidated
    chkpt_dir = f"{output_dir}/checkpoint-1"
    recovered_model_path = f"{chkpt_dir}/out.bin"
    # Pass an argv list and skip the shell: the paths are interpolated values,
    # so a shell string would be re-split on whitespace and shell
    # metacharacters. The script is still executed directly, as before.
    recover_cmd = [f"{chkpt_dir}/zero_to_fp32.py", chkpt_dir, recovered_model_path]
    # keep for quick debug
    # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] + recover_cmd)); die
    subprocess.check_call(recover_cmd)
    assert os.path.exists(recovered_model_path), f"{recovered_model_path} was not found"
def test_distributed_eval(self):
    """Run ``run_distributed_eval.py`` on wmt_en_ro and check the BLEU score.

    The script is launched via ``torch.distributed.launch`` and the "bleu"
    entry of the resulting ``test_bleu.json`` must be at least 25.
    """
    save_dir = self.get_auto_remove_tmp_dir()
    script_args = f"""
        --model_name Helsinki-NLP/opus-mt-en-ro
        --save_dir {save_dir}
        --data_dir {self.test_file_dir_str}/test_data/wmt_en_ro
        --num_beams 2
        --task translation
        """.split()

    # we want this test to run even if there is only one GPU, but if there are more we use them all
    gpu_count = get_gpu_count()
    launch_args = f"""
        -m torch.distributed.launch
        --nproc_per_node={gpu_count}
        {self.test_file_dir}/run_distributed_eval.py
        """.split()

    execute_subprocess_async([sys.executable, *launch_args, *script_args], env=self.get_env())

    metrics = load_json(os.path.join(save_dir, "test_bleu.json"))
    # print(metrics)
    self.assertGreaterEqual(metrics["bleu"], 25)
def run_trainer(
    self,
    eval_steps: int,
    max_len: int,
    model_name: str,
    num_train_epochs: int,
    learning_rate: float = 3e-3,
    distributed: bool = False,
    extra_args_str: str = None,
    predict_with_generate: bool = True,
):
    """Run ``run_translation.py`` (train, eval and predict) on wmt_en_ro test data.

    With ``distributed`` set, the script is launched in a subprocess via
    ``torch.distributed.launch`` with one process per GPU reported by
    ``get_gpu_count()``. Otherwise ``main()`` is called in-process with a
    patched ``sys.argv``.

    Args:
        eval_steps: value used for both ``--eval_steps`` and ``--save_steps``.
        max_len: value used for the source/target/val max length flags.
        model_name: value passed as ``--model_name_or_path``.
        num_train_epochs: value passed as ``--num_train_epochs``.
        learning_rate: value passed as ``--learning_rate``.
        distributed: selects the subprocess launch over the in-process run.
        extra_args_str: optional whitespace-separated arguments to append.
        predict_with_generate: when true, adds ``--predict_with_generate``.

    Returns:
        The temporary output directory handed to the script.
    """
    wmt_dir = self.examples_dir / "test_data/wmt_en_ro"
    output_dir = self.get_auto_remove_tmp_dir()

    script_args = f"""
        --model_name_or_path {model_name}
        --train_file {wmt_dir}/train.json
        --validation_file {wmt_dir}/val.json
        --test_file {wmt_dir}/test.json
        --output_dir {output_dir}
        --overwrite_output_dir
        --max_train_samples 8
        --max_val_samples 8
        --max_source_length {max_len}
        --max_target_length {max_len}
        --val_max_target_length {max_len}
        --do_train
        --do_eval
        --do_predict
        --num_train_epochs {num_train_epochs}
        --per_device_train_batch_size 4
        --per_device_eval_batch_size 4
        --learning_rate {learning_rate}
        --warmup_steps 8
        --evaluation_strategy steps
        --logging_steps 0
        --eval_steps {eval_steps}
        --save_steps {eval_steps}
        --group_by_length
        --label_smoothing_factor 0.1
        --adafactor
        --target_lang ro_RO
        --source_lang en_XX
        """.split()

    if predict_with_generate:
        script_args.append("--predict_with_generate")
    if extra_args_str is not None:
        script_args += extra_args_str.split()

    if not distributed:
        # In-process run: main() reads its arguments from sys.argv.
        with patch.object(sys, "argv", ["run_translation.py", *script_args]):
            main()
        return output_dir

    gpu_count = get_gpu_count()
    launch_args = f"""
        -m torch.distributed.launch
        --nproc_per_node={gpu_count}
        {self.examples_dir_str}/seq2seq/run_translation.py
        """.split()
    execute_subprocess_async([sys.executable, *launch_args, *script_args], env=self.get_env())
    return output_dir
def run_trainer(
    self,
    stage: str,
    model_name: str,
    eval_steps: int = 10,
    num_train_epochs: int = 1,
    do_train: bool = False,
    do_eval: bool = True,
    distributed: bool = True,
    fp16: bool = True,
    extra_args_str: str = None,
    remove_args_str: str = None,
):
    """Run ``run_translation.py`` with the DeepSpeed config for ``stage``.

    Args:
        stage: suffix selecting the ``ds_config_{stage}.json`` config file.
        model_name: value passed as ``--model_name_or_path``.
        eval_steps: value passed as ``--eval_steps``.
        num_train_epochs: value passed as ``--num_train_epochs`` when training.
        do_train: adds the training arguments.
        do_eval: adds the evaluation arguments.
        distributed: forwarded to ``self.get_launcher``.
        fp16: when true, ``--fp16`` is appended.
        extra_args_str: optional whitespace-separated arguments to append.
        remove_args_str: optional whitespace-separated tokens; every
            argument equal to one of them is dropped.

    Returns:
        The temporary output directory handed to the script.

    Raises:
        AssertionError: if neither ``do_train`` nor ``do_eval`` is set.
    """
    max_len = 32
    data_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
    output_dir = self.get_auto_remove_tmp_dir()
    args = f"""
        --model_name_or_path {model_name}
        --train_file {data_dir}/train.json
        --validation_file {data_dir}/val.json
        --output_dir {output_dir}
        --overwrite_output_dir
        --max_source_length {max_len}
        --max_target_length {max_len}
        --val_max_target_length {max_len}
        --warmup_steps 8
        --predict_with_generate
        --logging_steps 0
        --save_steps 0
        --eval_steps {eval_steps}
        --group_by_length
        --label_smoothing_factor 0.1
        --source_lang en
        --target_lang ro
        --report_to none
        """.split()
    # The command is an argv list, so each element reaches the script verbatim.
    # Literal double quotes inside the value would become part of the prefix,
    # so the value is passed bare. This assumes execute_subprocess_async runs
    # the list without a shell.
    args.extend(["--source_prefix", "translate English to Romanian: "])

    if fp16:
        args.append("--fp16")

    assert do_train or do_eval, "need at least do_train or do_eval for the test to run"

    if do_train:
        args.extend(
            f"""
            --do_train
            --num_train_epochs {num_train_epochs}
            --max_train_samples 16
            --per_device_train_batch_size 2
            --learning_rate 3e-3
            """.split()
        )

    if do_eval:
        args.extend(
            """
            --do_eval
            --max_eval_samples 16
            --per_device_eval_batch_size 2
            """.split()
        )

    if extra_args_str is not None:
        args.extend(extra_args_str.split())

    # currently only works for bool args
    if remove_args_str is not None:
        remove_args = set(remove_args_str.split())
        args = [x for x in args if x not in remove_args]

    ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()
    script = [f"{self.examples_dir_str}/pytorch/translation/run_translation.py"]
    launcher = self.get_launcher(distributed)

    cmd = launcher + script + args + ds_args
    # keep for quick debug
    # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
    execute_subprocess_async(cmd, env=self.get_env())

    return output_dir
def run_trainer(
    self,
    eval_steps: int,
    max_len: int,
    model_name: str,
    num_train_epochs: int,
    learning_rate: float = 3e-3,
    distributed: bool = False,
    extra_args_str: str = None,
    predict_with_generate: bool = True,
    do_train: bool = True,
    do_eval: bool = True,
    do_predict: bool = True,
):
    """Run ``pytorch/translation/run_translation.py`` with selectable phases.

    The command line is assembled from a training, an evaluation and a
    prediction group of arguments, each included only when its ``do_*`` flag
    is set. With ``distributed`` set, the script is launched in a subprocess
    via ``torch.distributed.launch`` on a port from
    ``get_torch_dist_unique_port()``. Otherwise ``main()`` is called
    in-process with a patched ``sys.argv``.

    Args:
        eval_steps: value used for ``--save_steps`` and ``--eval_steps``.
        max_len: value used for the source/target/val max length flags.
        model_name: value passed as ``--model_name_or_path``.
        num_train_epochs: value passed as ``--num_train_epochs``.
        learning_rate: value passed as ``--learning_rate``.
        distributed: selects the subprocess launch over the in-process run.
        extra_args_str: optional whitespace-separated arguments to append.
        predict_with_generate: when true, adds ``--predict_with_generate``.
        do_train: include the training arguments.
        do_eval: include the evaluation arguments.
        do_predict: include ``--do_predict``.

    Returns:
        The temporary output directory referenced by the training arguments.
    """
    wmt_dir = self.test_file_dir / "../fixtures/tests_samples/wmt_en_ro"
    output_dir = self.get_auto_remove_tmp_dir()

    train_group = f"""
        --model_name_or_path {model_name}
        --train_file {wmt_dir}/train.json
        --validation_file {wmt_dir}/val.json
        --test_file {wmt_dir}/test.json
        --output_dir {output_dir}
        --overwrite_output_dir
        --max_train_samples 8
        --max_source_length {max_len}
        --max_target_length {max_len}
        --do_train
        --num_train_epochs {num_train_epochs}
        --per_device_train_batch_size 4
        --learning_rate {learning_rate}
        --warmup_steps 8
        --logging_steps 0
        --logging_strategy no
        --save_steps {eval_steps}
        --group_by_length
        --label_smoothing_factor 0.1
        --adafactor
        --target_lang ro_RO
        --source_lang en_XX
        """
    eval_group = f"""
        --do_eval
        --per_device_eval_batch_size 4
        --max_eval_samples 8
        --val_max_target_length {max_len}
        --evaluation_strategy steps
        --eval_steps {eval_steps}
        """
    predict_group = """
        --do_predict
        """

    # Collect the enabled groups in order, tokenising each one as it is added.
    script_args = []
    for enabled, group in ((do_train, train_group), (do_eval, eval_group), (do_predict, predict_group)):
        if enabled:
            script_args.extend(group.split())
    if predict_with_generate:
        script_args.append("--predict_with_generate")
    if extra_args_str is not None:
        script_args.extend(extra_args_str.split())

    if not distributed:
        # In-process run: main() reads its arguments from sys.argv.
        with patch.object(sys, "argv", ["run_translation.py", *script_args]):
            main()
        return output_dir

    gpu_count = get_gpu_count()
    master_port = get_torch_dist_unique_port()
    launch_args = f"""
        -m torch.distributed.launch
        --nproc_per_node={gpu_count}
        --master_port={master_port}
        {self.examples_dir_str}/pytorch/translation/run_translation.py
        """.split()
    cmd = [sys.executable, *launch_args, *script_args]
    # keep for quick debug
    # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
    execute_subprocess_async(cmd, env=self.get_env())
    return output_dir
def run_trainer(
    self,
    eval_steps: int,
    max_len: str,
    model_name: str,
    num_train_epochs: int,
    distributed: bool = False,
    extra_args_str: str = None,
    remove_args_str: str = None,
):
    """Run ``seq2seq/finetune_trainer.py`` through the ``deepspeed`` command.

    Args:
        eval_steps: value used for both ``--save_steps`` and ``--eval_steps``.
        max_len: value used for the source/target/val max length flags.
        model_name: value passed as ``--model_name_or_path``.
        num_train_epochs: value passed as ``--num_train_epochs``.
        distributed: accepted but not read by this method.
        extra_args_str: optional whitespace-separated arguments to append.
        remove_args_str: optional whitespace-separated tokens; every
            argument equal to one of them is dropped.

    Returns:
        The temporary output directory handed to the script.
    """
    wmt_dir = self.examples_dir / "seq2seq/test_data/wmt_en_ro"
    output_dir = self.get_auto_remove_tmp_dir()

    script_args = f"""
        --model_name_or_path {model_name}
        --data_dir {wmt_dir}
        --output_dir {output_dir}
        --overwrite_output_dir
        --n_train 8
        --n_val 8
        --max_source_length {max_len}
        --max_target_length {max_len}
        --val_max_target_length {max_len}
        --do_train
        --do_eval
        --do_predict
        --num_train_epochs {num_train_epochs}
        --per_device_train_batch_size 4
        --per_device_eval_batch_size 4
        --learning_rate 3e-3
        --warmup_steps 8
        --evaluation_strategy steps
        --predict_with_generate
        --logging_steps 0
        --save_steps {eval_steps}
        --eval_steps {eval_steps}
        --group_by_length
        --label_smoothing_factor 0.1
        --adafactor
        --task translation
        --tgt_lang ro_RO
        --src_lang en_XX
        """.split()
    # --eval_beams 2

    if extra_args_str is not None:
        script_args += extra_args_str.split()

    if remove_args_str is not None:
        # Token-level filter: only exact matches are removed.
        unwanted = set(remove_args_str.split())
        script_args = [token for token in script_args if token not in unwanted]

    deepspeed_args = f"--deepspeed {self.test_file_dir_str}/ds_config.json".split()
    script_tokens = f"{self.test_file_dir}/../../seq2seq/finetune_trainer.py".split()

    cmd = ["deepspeed", *script_tokens, *script_args, *deepspeed_args]
    # keep for quick debug
    # print(" ".join(cmd)); die
    execute_subprocess_async(cmd, env=self.get_env())

    return output_dir