def create_pretraining_data_overfit_r(dataset_path, pretrained_model, seq_length, max_pred_per_seq):
    """Run create_pretraining_data.py to produce the overfit TFRecord file.

    The script located next to this file is invoked through /bin/bash with
    ``sample_text.txt`` (also next to this file) as input; the output is
    written to ``<dataset_path>/tf_examples.tfrecord`` and the vocabulary is
    read from ``<pretrained_model>/vocab.txt``.

    Args:
        dataset_path: Directory receiving the TFRecord; created if missing.
        pretrained_model: Directory containing ``vocab.txt``.
        seq_length: Value passed as ``--max_seq_length``.
        max_pred_per_seq: Value passed as ``--max_predictions_per_seq``.

    Raises:
        Exception: On any failure, including a non-zero exit status of the
            child process. The original error is chained as the cause.
    """
    host_name = socket.gethostname()
    try:
        ds_path = get_canonical_path(dataset_path)
        if not os.path.isdir(ds_path):
            os.makedirs(ds_path, mode=0o777, exist_ok=True)

        this_dir = Path(__file__).parent
        run_create_pretraining_path = this_dir.joinpath('create_pretraining_data.py')
        input_file_path = this_dir.joinpath("sample_text.txt")
        output_file_path = ds_path.joinpath("tf_examples.tfrecord")
        pretrained_model_path = get_canonical_path(pretrained_model)
        vocab_file_path = pretrained_model_path.joinpath("vocab.txt")

        command = (
            f"python3 {str(run_create_pretraining_path)}"
            f" --input_file={str(input_file_path)}"
            f" --output_file={str(output_file_path)}"
            f" --vocab_file={str(vocab_file_path)}"
            f" --do_lower_case=True"
            f" --max_seq_length={seq_length}"
            f" --max_predictions_per_seq={max_pred_per_seq}"
            f" --masked_lm_prob=0.15"
            f" --random_seed=12345"
            f" --dupe_factor=5"
        )
        print(f"{host_name}: {__file__}: create_pretraining_data_overfit_r() command = {command}")

        # Flush our own buffers first so the child's output is not interleaved
        # out of order with what was printed above.
        sys.stdout.flush()
        sys.stderr.flush()

        # check=True turns a failed script run into CalledProcessError instead
        # of silently continuing without a usable TFRecord file.
        subprocess.run(command, shell=True, executable='/bin/bash', check=True)
    except Exception as exc:
        raise Exception(f"{host_name}: Error in {__file__} create_pretraining_data_overfit_r({dataset_path}, {pretrained_model}, {seq_length}, {max_pred_per_seq})") from exc
def build_command(self):
    """Compose the MRPC fine-tuning command line and store it in ``self.command``.

    The command runs ``run_classifier.py`` (located next to this file),
    prefixed with ``time`` and, when ``self.mpirun_cmd`` is non-empty, with
    that mpirun invocation. Vocabulary, BERT config and initial checkpoint
    paths are taken from the ``self.pretrained_model`` directory.

    Raises:
        RuntimeError: If anything fails while assembling the command; the
            original error is chained as the cause.
    """
    try:
        script_path = Path(__file__).parent.joinpath('run_classifier.py')
        model_dir = get_canonical_path(self.pretrained_model)

        if self.use_horovod:
            use_horovod_str = "true"
        else:
            use_horovod_str = "false"

        vocab_path = str(model_dir.joinpath("vocab.txt"))
        bcfg_path = str(model_dir.joinpath("bert_config.json"))
        ic_path = str(model_dir.joinpath("bert_model.ckpt"))

        print(f"{self.__class__.__name__}: self.mpirun_cmd = {self.mpirun_cmd}")

        # Without an mpirun prefix the script is launched directly.
        launcher = "time" if self.mpirun_cmd == '' else f"time {self.mpirun_cmd}"
        init_command = f"{launcher} python3 {str(script_path)}"

        pieces = [
            f"{init_command}",
            f"--task_name=MRPC --do_train=true --do_eval=true --data_dir={get_canonical_path_str(self.args.dataset_path)}",
            f"--vocab_file={vocab_path}",
            f"--bert_config_file={bcfg_path}",
            f"--init_checkpoint={ic_path}",
            f"--max_seq_length={self.max_seq_len}",
            f"--train_batch_size={self.batch_size}",
            f"--learning_rate={self.args.learning_rate}",
            f"--num_train_epochs={self.epochs}",
            f"--output_dir={get_canonical_path_str(self.args.output_dir)}",
            f"--use_horovod={use_horovod_str}",
        ]
        self.command = " ".join(pieces)
        print('bert_mrpc_utils::self.command = ', self.command)
    except Exception as exc:
        raise RuntimeError(
            f"Error in {self.__class__.__name__} build_command()") from exc
def __init__(self, args):
    """Store the parsed arguments and set up the single- or multi-worker run.

    ``args.use_horovod`` doubles as the switch and the worker count: when it
    is not ``None`` Horovod is enabled and its value becomes the number of
    workers per HLS; otherwise a single worker is used.

    Args:
        args: Parsed command-line arguments; ``use_horovod`` is read here.
    """
    self.args = args
    self.hls_ips = ''
    self.mpirun_cmd = ''

    requested_workers = self.args.use_horovod
    if requested_workers is None:
        self.use_horovod = False
        self.num_workers_per_hls = 1
        self.scaleout = False
    else:
        self.use_horovod = True
        self.num_workers_per_hls = requested_workers
        self.scaleout = True
    self.num_workers_total = self.num_workers_per_hls

    print(
        f"use_horovod = {self.use_horovod}, num_workers_per_hls = {self.num_workers_per_hls}"
    )
    self.run_config_env_variables = {}

    # The temporary directory must exist before either setup path runs.
    os.makedirs(get_canonical_path("$HOME/tmp/"), mode=0o777, exist_ok=True)

    setup = (self.create_multi_worker_setup
             if self.use_horovod else self.create_single_worker_setup)
    setup()
def create_multi_worker_setup(self):
    """Build the mpirun prefix and HCL configuration for a Horovod run.

    Records ``NUM_WORKERS_PER_HLS`` in ``self.run_config_env_variables``,
    derives the OpenMPI ``PE`` binding value from the local ``lscpu`` output,
    stores the resulting prefix in ``self.mpirun_cmd``, creates the HCL
    config through the multi- or single-HLS helper and finally exports the
    prefix as the ``MPIRUN_CMD`` environment variable.

    Raises:
        AssertionError: If Horovod is off or fewer than 2 workers are set.
        Exception: If the binding resource type is neither 'socket' nor 'slot'.
    """
    assert self.use_horovod and self.num_workers_per_hls > 1, "Horovod run requires at least 2 workers"
    self.run_config_env_variables['NUM_WORKERS_PER_HLS'] = f"{self.num_workers_per_hls}"

    tmp_dir = get_canonical_path("$HOME/tmp/")
    run_per_ip(f"mkdir -p {str(tmp_dir)}", ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
    print(f"MULTI_HLS_IPS={os.environ.get('MULTI_HLS_IPS')}")

    # OpenMPI process bind resource type.
    mpi_map_by = "socket"

    # Query lscpu; stderr is folded into the captured output.
    cmd = 'lscpu | grep \"CPU(s):\"'
    lscpu_output = subprocess.run(cmd,
                                  shell=True,
                                  executable='/bin/bash',
                                  stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT).stdout

    # Determine the resources per process for OpenMPI binding from the second
    # whitespace-separated token of the lscpu output. 'socket' additionally
    # halves the per-worker share.
    extra_divisor = {"socket": 2, "slot": 1}.get(mpi_map_by)
    if extra_divisor is None:
        raise Exception("mpi_map_by must be either 'socket' or 'slot'.")
    mpi_map_by_pe = int(lscpu_output.split()[1]) // self.num_workers_per_hls // extra_divisor
    print(f"mpi_map_by_pe = {mpi_map_by_pe}")

    output_file_name = str(tmp_dir.joinpath("demo_bert_log/"))
    mpirun_parts = [
        "mpirun",
        " --allow-run-as-root",
        f" --tag-output --merge-stderr-to-stdout --output-filename {output_file_name}",
    ]
    if mpi_map_by_pe > 0:
        mpirun_parts.append(f" --bind-to core --map-by {mpi_map_by}:PE={mpi_map_by_pe}")
    # Assigned before the HCL setup helpers run, as they are instance methods
    # and may consult self.mpirun_cmd.
    self.mpirun_cmd = "".join(mpirun_parts)

    if is_valid_multi_node_config():
        hcl_config_path = self.create_multi_hls_setup(tmp_dir)
    else:
        hcl_config_path = self.create_single_hls_setup(tmp_dir)

    print(f"HCL_CONFIG_PATH = {str(os.environ.get('HCL_CONFIG_PATH'))}")
    print(f"hcl_config_path = {hcl_config_path} ->")
    print_file_contents(hcl_config_path)

    os.environ['MPIRUN_CMD'] = self.mpirun_cmd
    class_name = self.__class__.__name__
    print(f"{class_name} create_multi_worker_setup(): self.mpirun_cmd = {self.mpirun_cmd}")
    print(f"{class_name} create_multi_worker_setup(): MPIRUN_CMD = {os.environ.get('MPIRUN_CMD')}")
def generate_mpi_hostfile(file_path):
    """Write an MPI hostfile listing every configured node with 8 slots.

    Nothing is written unless ``is_valid_multi_node_config()`` is true. Any
    existing ``hostfile`` in the target directory is removed first, so the
    file always contains only the current node list.

    Args:
        file_path: Directory in which the file named ``hostfile`` is created;
            the directory is created if it does not exist.

    Returns:
        The path of the generated hostfile, or ``''`` when there is no valid
        multi-node configuration.
    """
    mpi_hostfile_path = ''
    if is_valid_multi_node_config():
        multi_hls_nodes = get_multi_node_config_nodes()
        print("Generating MPI hostfile...")

        hostfile_dir = get_canonical_path(file_path)
        os.makedirs(hostfile_dir, mode=0o777, exist_ok=True)
        mpi_hostfile_path = hostfile_dir.joinpath("hostfile")

        if os.path.exists(mpi_hostfile_path):
            # Removal goes through the shell helper rather than os.remove.
            cmd = f"rm -f {str(mpi_hostfile_path)}"
            run_cmd_as_subprocess(cmd)
        print(f"Path: {mpi_hostfile_path}")

        # The context manager guarantees the handle is closed even if the
        # write raises.
        with open(mpi_hostfile_path, 'a') as out_fid:
            config_str = ''.join(f"{node} slots=8\n" for node in multi_hls_nodes)
            print(f"MPI hostfile: \n{config_str}")
            out_fid.write(config_str)
    return mpi_hostfile_path
def prepare_output_dir_r(output_dir):
    """Ensure ``output_dir`` exists, removing any existing directory first.

    An existing directory is removed with ``rm -rf`` through
    ``run_cmd_as_subprocess`` and the directory is then (re)created with
    mode 0o777.

    Args:
        output_dir: Output directory path.

    Raises:
        Exception: On any failure; the original error is chained as the cause.
    """
    host_name = socket.gethostname()
    try:
        od_path = get_canonical_path(output_dir)
        already_present = os.path.isdir(od_path)
        if already_present:
            print(f"{host_name}: *** Cleaning existing {str(od_path)}...\n\n")
            # Shell removal is used instead of shutil.rmtree.
            run_cmd_as_subprocess(f"rm -rf {get_canonical_path_str(od_path)}")
        os.makedirs(od_path, mode=0o777, exist_ok=True)
    except Exception as exc:
        raise Exception(f"{host_name}: Error in {__file__} prepare_output_dir_r({output_dir})") from exc
def prepare_output_dir_squad_r(output_dir, batch_size, max_seq_len):
    """Prepare the SQuAD output directory, keeping *.tf_record files when possible.

    Three cases are handled:

    * the directory does not exist: it is created;
    * it exists and holds the marker ``last_config_<batch_size>_<max_seq_len>``:
      every sub-directory and every file whose suffix is not ``.tf_record``
      is removed;
    * it exists without that marker: the whole directory is removed,
      re-created, and the marker file is created.

    Args:
        output_dir: Output directory path.
        batch_size: Batch size, used in the marker file name.
        max_seq_len: Maximum sequence length, used in the marker file name.

    Raises:
        Exception: On any failure; the original error is chained as the cause.
    """
    host_name = socket.gethostname()
    try:
        od_path = get_canonical_path(output_dir)
        marker_name = f"last_config_{batch_size}_{max_seq_len}"
        route0 = 0  # selective clean: marker for this configuration exists
        route1 = 0  # full wipe: directory exists but the marker does not
        if os.path.isdir(od_path):
            cfg_path = os.fspath(od_path) + "/" + marker_name
            if os.path.exists(cfg_path):
                route0 = 1
            else:
                route1 = 1
        else:
            os.makedirs(od_path, exist_ok=True)

        if route0 == 1:
            print(
                f"{host_name}: *** Cleaning temp directory content in {output_dir}... (except *.tf_record files) \n\n"
            )
            # NOTE(review): the marker file has no '.tf_record' suffix, so it
            # is removed here as well -- confirm this is intended.
            with os.scandir(od_path) as it:
                for entry in it:
                    if entry.is_file():
                        if Path(entry.name).suffix != '.tf_record':
                            cmd = f"rm -f {get_canonical_path_str(entry.path)}"
                            run_cmd_as_subprocess(cmd)
                    elif entry.is_dir():
                        cmd = f"rm -rf {get_canonical_path_str(entry.path)}"
                        run_cmd_as_subprocess(cmd)

        if route1 == 1:
            print(
                f"{host_name}: *** Cleaning temp directory content in {output_dir}... \n\n"
            )
            # shutil.rmtree throws an exception when remote hosts share the
            # same file system paths, hence the shell removal.
            cmd = f"rm -rf {get_canonical_path_str(od_path)}"
            run_cmd_as_subprocess(cmd)
            os.makedirs(od_path, exist_ok=True)
            # os.open returns a raw descriptor; close it right away so that
            # creating the empty marker does not leak a descriptor.
            marker_fd = os.open(get_canonical_path_str(output_dir) + "/" + marker_name,
                                os.O_CREAT,
                                mode=0o644)
            os.close(marker_fd)
    except Exception as exc:
        raise Exception(
            f"{host_name}: Error in {__file__} prepare_output_dir_squad_r({output_dir}, {batch_size}, {max_seq_len})"
        ) from exc
def download_pretrained_model_r(pretrained_url, pretrained_model, flatten_archive=False):
    """Download and unpack a pre-trained model archive unless already present.

    Work is done relative to the parent of this file's directory. If the
    ``pretrained_model`` directory exists there it is reused. Otherwise
    ``<pretrained_model>.zip`` is downloaded from
    ``pretrained_url + pretrained_model + ".zip"`` when the archive is
    missing or empty, extracted, and a freshly downloaded archive is deleted
    afterwards. The caller's working directory is always restored.

    Args:
        pretrained_url: URL prefix the archive name is appended to.
        pretrained_model: Model directory name; also the archive base name.
        flatten_archive: When true, non-directory members are extracted one
            by one into the working directory instead of extracting the
            whole archive into ``pretrained_model``.

    Raises:
        Exception: On any failure; the original error is chained as the cause.
    """
    import shutil  # local import: only needed for the streamed download

    host_name = socket.gethostname()
    this_dir = get_canonical_path(os.curdir)
    try:
        os.chdir(Path(__file__).parent.parent)
        if os.path.isdir(pretrained_model):
            print(
                f"{host_name}: Reusing existing pre-trained model directory \'{pretrained_model}\'"
            )
        else:
            archive_name = pretrained_model + ".zip"
            needs_download = not os.path.exists(archive_name)
            if not needs_download and os.path.getsize(archive_name) == 0:
                print(
                    f"{host_name}: *** Broken file, needs download ...\n\n"
                )
                needs_download = True

            if needs_download:
                print(f"{host_name}: *** Downloading pre-trained model...\n\n")
                # Both handles are closed deterministically, and the payload
                # is streamed in chunks rather than held in memory at once.
                with urllib.request.urlopen(pretrained_url + archive_name) as inf, \
                        open(archive_name, "wb") as outf:
                    shutil.copyfileobj(inf, outf)

            print(f"{host_name}: *** Extracting pre-trained model...\n\n")
            with zipfile.ZipFile(archive_name, 'r') as zip_ref:
                if flatten_archive:
                    # large model is zipped with subdirectory; extract the
                    # file members individually and skip directory entries
                    for member in zip_ref.infolist():
                        if member.is_dir():
                            continue
                        zip_ref.extract(member)
                else:
                    zip_ref.extractall(pretrained_model)

            if needs_download:
                # Only an archive fetched by this call is removed.
                cmd = f"rm -f {pretrained_model}.zip"
                run_cmd_as_subprocess(cmd)
    except Exception as exc:
        raise Exception(
            f"{host_name}: Error in {__file__} download_pretrained_model()"
        ) from exc
    finally:
        # Restore the caller's working directory on success and on failure.
        os.chdir(this_dir)
def __init__(self, modelname, filename):
    """Remember the model name and config file, then process the config.

    Args:
        modelname: Stored as ``self.model``.
        filename: Path of the hb_config YAML file; stored as
            ``self.hb_config``.

    Raises:
        OSError: If the canonical config path is not an existing file.
    """
    self.model = modelname
    self.hb_config = filename

    # Reset before process_config_file() is invoked below.
    self.parsed_config = None
    self.env_variables = None
    self.model_parameters = None
    self.model_parameters_store_true = None

    config_path = get_canonical_path(self.hb_config)
    if not config_path.is_file():
        raise OSError(
            f"hb_config has to be existing yaml file, but there is no file {config_path}"
        )
    self.process_config_file(config_path)
def download_dataset_r(dataset_path):
    """Download the MRPC dataset unless ``dataset_path`` already exists.

    When the directory is missing it is created and
    ``download_glue_data.py`` (located next to this file) is run through
    /bin/bash with ``--data_dir`` set to the parent of the dataset directory
    and ``--tasks MRPC``.

    Args:
        dataset_path: Directory expected to hold the dataset.

    Raises:
        Exception: On any failure, including a non-zero exit status of the
            download script. The original error is chained as the cause.
    """
    host_name = socket.gethostname()
    try:
        ds_path = get_canonical_path(dataset_path)
        if not os.path.isdir(ds_path):
            print(f"{host_name}: *** Downloading dataset...\n\n")
            os.makedirs(ds_path, exist_ok=True)
            download_script = Path(__file__).parent.joinpath(
                "download_glue_data.py")

            # Flush our own buffers so the child's output appears after the
            # message printed above.
            sys.stdout.flush()
            sys.stderr.flush()

            # check=True reports a failed download instead of silently
            # leaving an empty dataset directory behind.
            subprocess.run(
                f"python3 {str(download_script)} --data_dir {str(ds_path.parent)} --tasks MRPC",
                shell=True,
                executable='/bin/bash',
                check=True)
    except Exception as exc:
        raise Exception(
            f"{host_name}: Error in {__file__} download_dataset_r({dataset_path})"
        ) from exc
def build_for_pretraining_lamb_phase2(self):
    """Compose the phase-2 pre-training command and store it in ``self.command``.

    Phase 2 starts from the phase-1 checkpoint
    ``<results_dir>/phase_1/model.ckpt-<p1_steps>`` and writes to
    ``<results_dir>/phase_2``. Unless ``args.fast_perf_only`` equals 1,
    ``self.p2_steps`` is rescaled in place by the ratio of the phase-1 to the
    phase-2 global batch size. The output directory is prepared and the
    required paths are checked through the instance helpers before the
    command is built.

    Raises:
        RuntimeError: If anything fails; the original error is chained as
            the cause.
    """
    try:
        model_dir = get_canonical_path(self.pretrained_model)
        bert_config = str(model_dir.joinpath("bert_config.json"))
        PREC = self.set_PREC()
        horovod_str = "" if self.args.use_horovod is None else "--horovod"

        # Phase 1 configuration
        gbs_phase1 = self.p1_batch_size * self.num_acc_steps_phase1
        PHASE1_CKPT = get_canonical_path(self.results_dir).joinpath(
            "phase_1").joinpath(f"model.ckpt-{self.p1_steps}")

        # Phase 2 configuration
        seq_len = self.p2_max_seq_len
        max_pred_per_seq = 80
        gbs_phase2 = self.p2_batch_size * self.num_acc_steps_phase2

        if self.args.fast_perf_only != 1:
            # Adjust for batch size
            self.p2_steps = int((self.p2_steps * gbs_phase1) / gbs_phase2)

        results_dir_phase2 = self.results_dir + "/" + "phase_2"
        # run_per_ip
        results_phase2_path = get_canonical_path(results_dir_phase2)
        self.prepare_output_dir(results_dir_phase2)

        input_files_path = get_canonical_path(
            self.args.dataset_path).joinpath(
                f"seq_len_{seq_len}").joinpath("books_wiki_en_corpus/")

        # run_per_ip
        dir_list = " ".join([
            str(input_files_path),
            str(results_dir_phase2),
            bert_config,
            f"{str(PHASE1_CKPT)}.meta",
        ])
        self.check_dirs(dir_list)

        input_files_dir = str(input_files_path.joinpath("training"))
        eval_files_dir = str(input_files_path.joinpath("test"))
        dllog_path = str(results_phase2_path.joinpath("bert_dllog.json"))
        script_path = Path(__file__).parent.joinpath(
            "pretraining").joinpath('run_pretraining.py')

        print(
            f"{self.__class__.__name__}: self.mpirun_cmd = {self.mpirun_cmd}"
        )
        # Without an mpirun prefix the script is launched directly.
        launcher = "time" if self.mpirun_cmd == '' else f"time {self.mpirun_cmd}"
        init_command = f"{launcher} python3 {str(script_path)}"

        pieces = [
            f"{init_command}",
            f"--input_files_dir={input_files_dir}",
            f"--init_checkpoint={str(PHASE1_CKPT)}",
            f"--eval_files_dir={eval_files_dir}",
            f"--output_dir={str(results_phase2_path)}",
            f"--bert_config_file={bert_config}",
            "--do_train=True",
            "--do_eval=False",
            f"--train_batch_size={self.p2_batch_size}",
            f"--eval_batch_size={self.eval_batch_size}",
            f"--max_seq_length={seq_len}",
            f"--max_predictions_per_seq={max_pred_per_seq}",
            f"--num_train_steps={self.p2_steps}",
            f"--num_accumulation_steps={self.num_acc_steps_phase2}",
            f"--num_warmup_steps={self.p2_warmup}",
            f"--save_checkpoints_steps={self.save_checkpoints_steps}",
            f"--learning_rate={self.learning_rate_phase2}",
            f"{horovod_str} {PREC}",
            "--allreduce_post_accumulation=True",
            f"--dllog_path={dllog_path}",
        ]
        self.command = " ".join(pieces)

        separator = "-------------------------------------------------------------------------"
        print(separator + "\n")
        print(
            "Running the Pre-Training :: Phase 2: Next Sentence Prediction\n"
        )
        print(separator)
        print(
            'bert_pretraining_bookswiki_utils::self.command for Phase2 = ',
            self.command)
    except Exception as exc:
        raise RuntimeError(
            f"Error in {self.__class__.__name__} build_for_pretraining_lamb_phase2()"
        ) from exc
def build_for_pretraining_lamb_phase1(self):
    """Compose the phase-1 pre-training command and store it in ``self.command``.

    Results go to ``<results_dir>/phase_1``; input data is read from
    ``<dataset_path>/seq_len_<p1_max_seq_len>/books_wiki_en_corpus/``. The
    output directory is prepared and the required paths are checked through
    the instance helpers before the command is built.

    Raises:
        RuntimeError: If anything fails; the original error is chained as
            the cause.
    """
    try:
        model_dir = get_canonical_path(self.pretrained_model)
        bert_config = str(model_dir.joinpath("bert_config.json"))
        PREC = self.set_PREC()
        horovod_str = "" if self.args.use_horovod is None else "--horovod"

        # Phase 1 configuration (the global batch size is computed here but
        # is not part of the phase-1 command).
        gbs_phase1 = self.p1_batch_size * self.num_acc_steps_phase1
        seq_len = self.p1_max_seq_len
        max_pred_per_seq = 20

        results_dir_phase1 = self.results_dir + "/" + "phase_1"
        # run_per_ip
        results_phase1_path = get_canonical_path(results_dir_phase1)
        self.prepare_output_dir(results_dir_phase1)

        input_files_path = get_canonical_path(
            self.args.dataset_path).joinpath(
                f"seq_len_{seq_len}").joinpath("books_wiki_en_corpus/")

        # run_per_ip
        dir_list = " ".join([
            str(input_files_path),
            results_dir_phase1,
            bert_config,
        ])
        self.check_dirs(dir_list)

        input_files_dir = str(input_files_path.joinpath("training"))
        eval_files_dir = str(input_files_path.joinpath("test"))
        dllog_path = str(results_phase1_path.joinpath("bert_dllog.json"))
        script_path = Path(__file__).parent.joinpath(
            "pretraining").joinpath('run_pretraining.py')

        print(
            f"{self.__class__.__name__}: self.mpirun_cmd = {self.mpirun_cmd}"
        )
        # Without an mpirun prefix the script is launched directly.
        launcher = "time" if self.mpirun_cmd == '' else f"time {self.mpirun_cmd}"
        init_command = f"{launcher} python3 {str(script_path)}"

        pieces = [
            f"{init_command}",
            f"--input_files_dir={input_files_dir}",
            f"--eval_files_dir={eval_files_dir}",
            f"--output_dir={str(results_phase1_path)}",
            f"--bert_config_file={bert_config}",
            "--do_train=True",
            "--do_eval=False",
            f"--train_batch_size={self.p1_batch_size}",
            f"--eval_batch_size={self.eval_batch_size}",
            f"--max_seq_length={seq_len}",
            f"--max_predictions_per_seq={max_pred_per_seq}",
            f"--num_train_steps={self.p1_steps}",
            f"--num_accumulation_steps={self.num_acc_steps_phase1}",
            f"--num_warmup_steps={self.p1_warmup}",
            f"--save_checkpoints_steps={self.save_checkpoints_steps}",
            f"--learning_rate={self.learning_rate_phase1}",
            f"{horovod_str} {PREC}",
            "--allreduce_post_accumulation=True",
            f"--dllog_path={dllog_path}",
        ]
        self.command = " ".join(pieces)

        separator = "-------------------------------------------------------------------------"
        print(separator + "\n")
        print(
            "Running the Pre-Training :: Phase 1: Masked Language Model\n")
        print(separator)
        print(
            'bert_pretraining_bookswiki_utils::self.command for Phase1 = ',
            self.command)
    except Exception as exc:
        raise RuntimeError(
            f"Error in {self.__class__.__name__} build_for_pretraining_lamb_phase1()"
        ) from exc
def build_command(self):
    """Compose the overfit pre-training command and store it in ``self.command``.

    ``max_predictions_per_seq`` is 20 for sequence length 128, 80 for 512,
    and ``floor(0.15 * seq_length)`` (with a warning) otherwise. The
    pre-training data and the results path are prepared through the instance
    helpers, the learning rate is scaled from a base of 0.006 by
    ``p1_batch_size * num_workers_total * num_acc_steps / 65536``, and the
    initial checkpoint ``model.ckpt-0`` must exist under
    ``<init_checkpoint_path>/<model_variant>``.

    Raises:
        RuntimeError: If anything fails, including a missing initial
            checkpoint; the original error is chained as the cause.
    """
    try:
        seq_length = self.p1_max_seq_len
        known_max_pred = {128: 20, 512: 80}
        if seq_length in known_max_pred:
            max_pred_per_seq = known_max_pred[seq_length]
        else:
            print(f"Warning: Unsupported max_sequence_length {seq_length}. Setting max_predictions_per_seq to floor(0.15*max_sequence_length). Please see -s parameter for details")
            max_pred_per_seq = math.floor(0.15 * seq_length)

        # run_per_ip
        self.create_pretraining_data(seq_length, max_pred_per_seq)
        sys.stdout.flush()
        sys.stderr.flush()

        horovod_str = "" if self.args.use_horovod is None else "--horovod"

        # run_per_ip
        self.prepare_results_path(self.results_dir)

        base_lr = 0.006
        num_acc_steps = 1
        learning_rate = float(base_lr * (
            self.p1_batch_size * self.num_workers_total * num_acc_steps
        ) / 65536)
        print(f"learning_rate = {learning_rate}")

        ds_path = str(get_canonical_path(self.dataset_path))
        results_path = get_canonical_path(self.results_dir)
        model_dir = get_canonical_path(self.pretrained_model)
        bert_config = str(model_dir.joinpath("bert_config.json"))

        # The '.meta' file is what gets checked for existence; the command
        # itself receives the checkpoint prefix without that extension.
        init_checkpoint_path = get_canonical_path(
            self.args.init_checkpoint_path).joinpath(
                f"{self.args.model_variant}").joinpath("model.ckpt-0.meta")
        variant_dir = str(get_canonical_path(
            self.args.init_checkpoint_path).joinpath(f"{self.args.model_variant}"))
        init_checkpoint = variant_dir + "/" + "model.ckpt-0"
        if not os.path.exists(init_checkpoint_path):
            raise Exception(f"Error: init_checkpoint_path {str(init_checkpoint_path)} file or directory missing. Please mount correctly")

        dllog_path = str(results_path.joinpath("bert_dllog.json"))
        script_path = Path(__file__).parent.joinpath("pretraining").joinpath('run_pretraining.py')

        print(f"{self.__class__.__name__}: self.mpirun_cmd = {self.mpirun_cmd}")
        # Without an mpirun prefix the script is launched directly.
        launcher = "time" if self.mpirun_cmd == '' else f"time {self.mpirun_cmd}"
        init_command = f"{launcher} python3 {str(script_path)}"

        pieces = [
            f"{init_command}",
            f"--input_files_dir={ds_path}",
            f"--eval_files_dir={ds_path}",
            f"--output_dir={str(results_path)}",
            "--do_train=True",
            "--do_eval=True",
            f"--bert_config_file={bert_config}",
            f"--init_checkpoint={init_checkpoint}",
            f"--train_batch_size={self.p1_batch_size}",
            f"--eval_batch_size={self.eval_batch_size}",
            f"--max_seq_length={seq_length}",
            f"--max_predictions_per_seq={max_pred_per_seq}",
            f"--num_train_steps={self.p1_steps}",
            f"--num_accumulation_steps={num_acc_steps}",
            f"--num_warmup_steps={self.p1_warmup}",
            f"--dllog_path={dllog_path}",
            f"--learning_rate={learning_rate}",
            f"{horovod_str}",
            "--amp=False",
            "--use_xla=False",
        ]
        self.command = " ".join(pieces)
        print('bert_pretraining_overfit_utils build_command(): self.command = ', self.command)
    except Exception as exc:
        raise RuntimeError(f"Error in {self.__class__.__name__} build_command()") from exc