def run_per_ip(cmd, env_vars_for_mpi=None, use_devnull=False):
    """Run *cmd* once per configured host IP.

    In single-node mode the command is executed locally as a subprocess.
    In multi-node mode (MULTI_HLS_IPS configured) the command is wrapped
    in an ``mpirun`` invocation that targets every listed host over ssh.

    Args:
        cmd: Shell command string to execute.
        env_vars_for_mpi: Optional iterable of environment-variable names
            to forward to remote processes via ``mpirun -x``.
        use_devnull: Forwarded to run_cmd_as_subprocess to suppress output.

    Raises:
        RuntimeError: If invoked from within an OpenMPI context
            (OMPI_COMM_WORLD_SIZE is set) — this function launches mpirun
            itself and must not be nested inside one.
    """
    if os.environ.get('OMPI_COMM_WORLD_SIZE') is not None:
        raise RuntimeError(
            "Function run_per_ip is not meant to be run from within an OpenMPI context. It is intended to invoke mpirun by itself."
        )

    if not is_valid_multi_node_config():
        print(
            "************************* Single-HLS mode *************************"
        )
        run_cmd_as_subprocess(cmd, use_devnull)
    else:
        # Default to port 3022 unless the container exposes a custom sshd port
        # (falls back on empty string too, matching the original truthiness check).
        portnum = os.environ.get('DOCKER_SSHD_PORT') or 3022
        scmd = f"mpirun --allow-run-as-root --mca plm_rsh_args -p{portnum} --tag-output --merge-stderr-to-stdout --prefix /usr/lib/habanalabs/openmpi/ -H {os.environ.get('MULTI_HLS_IPS')} "
        if env_vars_for_mpi is not None:
            for env_var in env_vars_for_mpi:
                scmd += f"-x {env_var} "
        scmd += cmd
        print(
            f"{socket.gethostname()}: In MULTI NODE run_per_ip(): scmd = {scmd}"
        )
        run_cmd_as_subprocess(scmd, use_devnull)
# --- Example #2 (scraped-snippet separator; original marker: "Пример #2" / "0") ---
    def create_multi_worker_setup(self):
        """Set up a multi-worker (Horovod) run and build ``self.mpirun_cmd``.

        Creates ``$HOME/tmp`` on every node, derives the OpenMPI binding
        granularity (PE) from the local ``lscpu`` CPU count, assembles the
        mpirun command line, generates the HCL config for single- or
        multi-HLS mode, and exports the result via the MPIRUN_CMD env var.

        Raises:
            AssertionError: If Horovod is off or fewer than 2 workers are set.
            Exception: If mpi_map_by is neither 'socket' nor 'slot'.
        """
        assert self.use_horovod and self.num_workers_per_hls > 1, "Horovod run requires at least 2 workers"
        self.run_config_env_variables[
            'NUM_WORKERS_PER_HLS'] = f"{self.num_workers_per_hls}"
        tmp_dir = get_canonical_path("$HOME/tmp/")
        # Create the tmp directory on every participating node.
        run_per_ip(f"mkdir -p {str(tmp_dir)}", ['MULTI_HLS_IPS', 'PYTHONPATH'],
                   False)
        print(f"MULTI_HLS_IPS={os.environ.get('MULTI_HLS_IPS')}")

        # OpenMPI process bind resource type.
        mpi_map_by = "socket"

        # Read the local CPU count; proc.stdout.read() yields bytes,
        # e.g. b'CPU(s):             96\n', and bytes.split()/int() handle that.
        cmd = 'lscpu | grep \"CPU(s):\"'
        with subprocess.Popen(cmd,
                              shell=True,
                              executable='/bin/bash',
                              stdout=subprocess.PIPE,
                              stderr=subprocess.STDOUT) as proc:
            lscpu_output = proc.stdout.read()
        # Determine the optimal value of resources per process of OpenMPI
        # binding based on local lscpu.
        if mpi_map_by == "socket":
            # Extra // 2 relative to 'slot' — presumably accounts for two
            # sockets (or hyper-threading) per HLS host; TODO confirm.
            mpi_map_by_pe = int(
                lscpu_output.split()[1]) // self.num_workers_per_hls // 2
        elif mpi_map_by == "slot":
            mpi_map_by_pe = int(
                lscpu_output.split()[1]) // self.num_workers_per_hls
        else:
            raise Exception("mpi_map_by must be either 'socket' or 'slot'.")

        print(f"mpi_map_by_pe = {mpi_map_by_pe}")

        output_file_name = str(tmp_dir.joinpath("demo_bert_log/"))
        self.mpirun_cmd = "mpirun"
        self.mpirun_cmd += " --allow-run-as-root"
        self.mpirun_cmd += f" --tag-output --merge-stderr-to-stdout --output-filename {output_file_name}"

        # Only pin processes when a positive PE count could be computed.
        if mpi_map_by_pe > 0:
            self.mpirun_cmd += f" --bind-to core --map-by {mpi_map_by}:PE={mpi_map_by_pe}"

        hcl_config_path = ''

        if is_valid_multi_node_config():
            hcl_config_path = self.create_multi_hls_setup(tmp_dir)
        else:
            hcl_config_path = self.create_single_hls_setup(tmp_dir)

        print(f"HCL_CONFIG_PATH = {str(os.environ.get('HCL_CONFIG_PATH'))}")
        print(f"hcl_config_path = {hcl_config_path} ->")
        print_file_contents(hcl_config_path)

        # Downstream tooling picks the final command up from the environment.
        os.environ['MPIRUN_CMD'] = self.mpirun_cmd
        print(
            f"{self.__class__.__name__} create_multi_worker_setup(): self.mpirun_cmd = {self.mpirun_cmd}"
        )
        print(
            f"{self.__class__.__name__} create_multi_worker_setup(): MPIRUN_CMD = {os.environ.get('MPIRUN_CMD')}"
        )
 def prepare_results_path(self, results_dir):
     """Create the results directory — on every node when multi-node Horovod is active."""
     try:
         multi_node = self.use_horovod and is_valid_multi_node_config()
         if multi_node:
             # Fan the shared helper script out to each configured IP.
             script = Path(__file__).parent.parent.parent / 'common' / 'prepare_output_dir.py'
             run_per_ip(f"python3 {str(script)} {results_dir}",
                        ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
         else:
             # Single-node: call the helper in-process.
             prepare_output_dir.prepare_output_dir_r(results_dir)
     except Exception as exc:
         raise RuntimeError(f"Error in {self.__class__.__name__} prepare_results_path({results_dir})") from exc
 def create_pretraining_data(self, seq_length, max_pred_per_seq):
     """Generate overfit pretraining data, fanning out per IP under multi-node Horovod."""
     try:
         if not (self.use_horovod and is_valid_multi_node_config()):
             # Single-node: invoke the generator in-process.
             create_pretraining_data_overfit.create_pretraining_data_overfit_r(
                 self.dataset_path, self.pretrained_model, seq_length, max_pred_per_seq)
         else:
             script = Path(__file__).parent / 'create_pretraining_data_overfit.py'
             run_per_ip(
                 f"python3 {str(script)} {self.dataset_path} {self.pretrained_model} {seq_length} {max_pred_per_seq}",
                 ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
     except Exception as exc:
         raise RuntimeError(f"Error in {self.__class__.__name__} create_pretraining_data({self.dataset_path} {self.pretrained_model} {seq_length} {max_pred_per_seq})") from exc
 def check_dirs(self, largs):
     """Validate directories — on each node when multi-node Horovod is active."""
     try:
         multi_node = self.use_horovod and is_valid_multi_node_config()
         if multi_node:
             script = Path(__file__).parent.parent.parent / 'common' / 'check_dirs.py'
             run_per_ip(f"python3 {str(script)} {largs}",
                        ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
         else:
             # Helper expects a list of paths, so split the arg string locally.
             check_dirs.check_dirs_r(largs.split())
     except Exception as exc:
         raise RuntimeError(
             f"Error in {self.__class__.__name__} check_dirs(largs)"
         ) from exc
 def download_dataset(self):
     """Download the dataset — on every node when multi-node Horovod is active."""
     try:
         multi_node = self.use_horovod and is_valid_multi_node_config()
         if not multi_node:
             # Single-node: download in-process via the helper module.
             download_dataset.download_dataset_r(self.args.dataset_path)
         else:
             script = Path(__file__).parent / 'download' / 'download_dataset.py'
             run_per_ip(
                 f"python3 {str(script)} {self.args.dataset_path}",
                 ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
     except Exception as exc:
         raise RuntimeError(
             f"Error in {self.__class__.__name__} download_dataset()"
         ) from exc
 def prepare_output_dir(self):
     """Prepare the SQuAD output directory; fans out per IP for multi-node Horovod runs."""
     try:
         use_remote = self.use_horovod and is_valid_multi_node_config()
         if not use_remote:
             # Single-node: prepare the directory in-process.
             prepare_output_dir_squad.prepare_output_dir_squad_r(
                 self.args.output_dir, self.batch_size, self.max_seq_len)
             return
         helper = Path(__file__).parent / 'prepare_output_dir_squad.py'
         run_per_ip(
             f"python3 {str(helper)} {self.args.output_dir} {self.batch_size} {self.max_seq_len}",
             ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
     except Exception as exc:
         raise RuntimeError(
             f"Error in {self.__class__.__name__} prepare_output_dir()"
         ) from exc
# --- Example #8 (scraped-snippet separator; original marker: "Пример #8" / "0") ---
 def download_pretrained_model(self, horovod_run):
     """Fetch the pretrained model — on each node for multi-node Horovod runs.

     Args:
         horovod_run: Whether this is a Horovod (multi-worker) run.
     """
     try:
         if not (horovod_run and is_valid_multi_node_config()):
             # Single-node: download in-process via the helper module.
             download_pretrained_model.download_pretrained_model_r(
                 self.pretrained_url, self.pretrained_model, False)
         else:
             helper = Path(__file__).parent / 'download' / 'download_pretrained_model.py'
             run_per_ip(
                 f"python3 {str(helper)} {self.pretrained_url} {self.pretrained_model} False",
                 ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
     except Exception as exc:
         raise RuntimeError(
             f"Error in {self.__class__.__name__} download_pretrained_model()"
         ) from exc
def generate_mpi_hostfile(file_path):
    """Write an OpenMPI hostfile listing every configured node.

    Args:
        file_path: Directory in which to create the ``hostfile``.

    Returns:
        Path of the generated hostfile, or '' when no valid multi-node
        configuration is present.
    """
    mpi_hostfile_path = ''
    if is_valid_multi_node_config():
        multi_hls_nodes = get_multi_node_config_nodes()
        print("Generating MPI hostfile...")
        file_name = "hostfile"
        os.makedirs(get_canonical_path(file_path), mode=0o777, exist_ok=True)
        mpi_hostfile_path = get_canonical_path(file_path).joinpath(file_name)
        if os.path.exists(mpi_hostfile_path):
            # Deleted via a shell subprocess rather than os.remove, matching
            # how the rest of this tooling manipulates files on the node.
            cmd = f"rm -f {str(mpi_hostfile_path)}"
            run_cmd_as_subprocess(cmd)
        print(f"Path: {mpi_hostfile_path}")
        # 8 worker slots per node — presumably one per device on an HLS;
        # TODO confirm against num_workers_per_hls.
        config_str = ''.join(f"{node} slots=8\n" for node in multi_hls_nodes)
        print(f"MPI hostfile: \n{config_str}")
        # Context manager guarantees the handle is closed even if write fails
        # (the original leaked the handle on exceptions).
        with open(mpi_hostfile_path, 'a') as out_fid:
            out_fid.write(config_str)
    return mpi_hostfile_path