Example No. 1
 def download_dataset(self):
     try:
         if self.scaleout and is_valid_multi_node_config() and not self.kubernetes_run:
             download_dataset_path = Path(__file__).parent.joinpath('download').joinpath('download_dataset.py')
             run_per_ip(f"{sys.executable} {str(download_dataset_path)} {self.args.dataset_path}", ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
         else:
             download_dataset.download_dataset_r(self.args.dataset_path)
     except Exception as exc:
         raise RuntimeError(f"Error in {self.__class__.__name__} download_dataset()") from exc
 def prepare_results_path(self, results_dir):
     try:
         if self.scaleout and is_valid_multi_node_config() and not self.kubernetes_run:
             prepare_output_dir_path = Path(__file__).parent.parent.parent.parent.parent.joinpath('central').joinpath('prepare_output_dir.py')
             run_per_ip(f"{sys.executable} {str(prepare_output_dir_path)} {results_dir}", ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
         else:
             prepare_output_dir.prepare_output_dir_r(results_dir)
     except Exception as exc:
         raise RuntimeError(f"Error in {self.__class__.__name__} prepare_results_path({results_dir})") from exc
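Note: these examples all share one pattern. In a scale-out run, run_per_ip launches a small helper script once on every host listed in MULTI_HLS_IPS; in a single-node run the corresponding *_r function is simply called in-process. run_per_ip itself is not part of the snippet; the sketch below is only a rough illustration of that per-host fan-out, assuming an ssh-based launch (the run_per_ip_sketch name and the ssh details are assumptions, not the real helper's implementation).

import os
import shlex
import subprocess

def run_per_ip_sketch(cmd, env_vars_to_forward):
    # Hypothetical simplification: execute `cmd` once on every host listed in
    # MULTI_HLS_IPS, forwarding a few environment variables on the command line.
    hosts = [h.strip() for h in os.environ.get("MULTI_HLS_IPS", "").split(",") if h.strip()]
    forwarded = " ".join(f"{name}={shlex.quote(os.environ[name])}"
                         for name in env_vars_to_forward if name in os.environ)
    for host in hosts:
        subprocess.run(["ssh", host, f"{forwarded} {cmd}"], check=True)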
Example No. 3
 def prepare_output_dir(self):
     try:
         if self.scaleout and is_valid_multi_node_config() and not self.kubernetes_run:
             prepare_output_dir_squad_path = Path(__file__).parent.joinpath('prepare_output_dir_squad.py')
             run_per_ip(f"{sys.executable} {str(prepare_output_dir_squad_path)} {self.args.output_dir} {self.batch_size} {self.max_seq_len}", ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
         else:
             prepare_output_dir_squad.prepare_output_dir_squad_r(self.args.output_dir, self.batch_size, self.max_seq_len)
     except Exception as exc:
         raise RuntimeError(f"Error in {self.__class__.__name__} prepare_output_dir()") from exc
Example No. 4
 def create_pretraining_data(self, seq_length, max_pred_per_seq):
     try:
         if self.scaleout and is_valid_multi_node_config() and not self.kubernetes_run:
             create_pt_data_path = Path(__file__).parent.joinpath('data_preprocessing').joinpath('create_pretraining_data_overfit.py')
             run_per_ip(f"{sys.executable} {str(create_pt_data_path)} {self.dataset_path} {self.pretrained_model} {seq_length} {max_pred_per_seq}", ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
         else:
             create_pretraining_data_overfit.create_pretraining_data_overfit_r(self.dataset_path, self.pretrained_model, seq_length, max_pred_per_seq)
     except Exception as exc:
         raise RuntimeError(f"Error in {self.__class__.__name__} create_pretraining_data({self.dataset_path} {self.pretrained_model} {seq_length} {max_pred_per_seq})") from exc
Example No. 5
 def download_pretrained_model(self, horovod_run):
   try:
     download_pretrained_model_path = Path(__file__).parent.joinpath(
           'download').joinpath('download_pretrained_model.py')
     if horovod_run and is_valid_multi_node_config():
       run_per_ip(f"{sys.executable} {str(download_pretrained_model_path)} {self.pretrained_url} {self.pretrained_model} False", [
                  'MULTI_HLS_IPS', 'PYTHONPATH'], False)
     else:
       run_cmd_as_subprocess(f"{sys.executable} {str(download_pretrained_model_path)} {self.pretrained_url} {self.pretrained_model} False")
   except Exception as exc:
     raise RuntimeError(f"Error in {self.__class__.__name__} download_pretrained_model()") from exc
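In the single-node branch above, the download script is still launched as a separate process via run_cmd_as_subprocess rather than imported. That helper is not shown in the snippet; a minimal sketch, assuming it is just a thin wrapper over subprocess that raises on a non-zero exit code (the run_cmd_as_subprocess_sketch name is introduced here for illustration):

import shlex
import subprocess

def run_cmd_as_subprocess_sketch(cmd: str) -> None:
    # Hypothetical stand-in: split the command line and run it,
    # raising CalledProcessError if the command fails.
    subprocess.run(shlex.split(cmd), check=True)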
 def check_dirs(self, largs):
     try:
         if self.scaleout and is_valid_multi_node_config(
         ) and not self.kubernetes_run:
             check_dirs_path = Path(
                 __file__).parent.parent.parent.parent.joinpath(
                     'central').joinpath('check_dirs.py')
             run_per_ip(f"{sys.executable} {str(check_dirs_path)} {largs}",
                        ['MULTI_HLS_IPS', 'PYTHONPATH'], False)
         else:
             check_dirs.check_dirs_r(largs.split())
     except Exception as exc:
         raise RuntimeError(
             f"Error in {self.__class__.__name__} check_dirs(largs)"
         ) from exc
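check_dirs_r is also not part of the snippet; based on how it is called here (a whitespace-separated list of paths), a plausible sketch is a per-directory existence and writability check (the writability check and the check_dirs_sketch name are assumptions, not the original implementation):

import os

def check_dirs_sketch(dirs):
    # Hypothetical sketch: verify each path is an existing, writable directory.
    for d in dirs:
        if not os.path.isdir(d):
            raise RuntimeError(f"{d} is not an existing directory")
        if not os.access(d, os.W_OK):
            raise RuntimeError(f"{d} is not writable")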
Example No. 7
    def setup_config_env(self):
        print(f"self.world_size = {self.__world_size}")

        tmp_dir = '/tmp'
        __worker_per_node = self.__world_size
        gen_hcl_config = True
        # Don't generate an HCL config in the scenarios below:
        # - HCCL host NIC scaling is enabled, i.e. "HCCL_OVER_TCP" is 1/True
        # - HCCL libfabric host NIC scaling is enabled, i.e. "HCCL_OVER_OFI" is 1/True
        # - HCL_CONFIG_PATH is already set
        hccl_over_tcp = os.getenv("HCCL_OVER_TCP")
        hccl_over_ofi = os.getenv("HCCL_OVER_OFI")
        if hccl_over_tcp or hccl_over_ofi:
            if hccl_over_tcp:
                hccl_over_tcp = hccl_over_tcp.lower() in ["1", "true"]
            if hccl_over_ofi:
                hccl_over_ofi = hccl_over_ofi.lower() in ["1", "true"]
            print(f"HCCL_OVER_TCP={os.getenv('HCCL_OVER_TCP')}")
            print(f"HCCL_OVER_OFI={os.getenv('HCCL_OVER_OFI')}")
            if hccl_over_tcp or hccl_over_ofi:
                print("skiping HCL config generation")
                gen_hcl_config = False
        if os.getenv("HCL_CONFIG_PATH"):
            print("HCL_CONFIG_PATH is already set")
            print("skiping HCL config generation")
            gen_hcl_config = False
        if self.__multi_hls:
            __cnt = len(os.getenv("MULTI_HLS_IPS").split(','))
            gen_hcl_path = Path(__file__).parent.parent.parent.joinpath(
                'central/generate_hcl_config.py')
            # Create HCL config on each remote IP.
            if gen_hcl_config:
                __worker_per_node = self.__world_size // __cnt
                run_per_ip((f"{sys.executable} {str(gen_hcl_path)} {tmp_dir} "
                            f"{__worker_per_node} {self.___hls_type}"),
                           ['MULTI_HLS_IPS', 'PYTHONPATH'], False)

        if gen_hcl_config and self.__world_size > 1:
            # HCL_CONFIG_PATH env var is set in generate_hcl_config_r()
            generate_hcl_config.generate_hcl_config_unless_hccl(
                f'{tmp_dir}', __worker_per_node, hls_type=self.___hls_type)
        print(
            f"HLS ({self.__world_size}): HCL_CONFIG_PATH = {str(os.environ.get('HCL_CONFIG_PATH'))}"
        )
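The HCCL_OVER_TCP / HCCL_OVER_OFI handling above boils down to one rule: treat the variable as enabled when it is set to "1" or "true" (case-insensitive), and skip HCL config generation when either flag is enabled or HCL_CONFIG_PATH is already set. A small helper capturing that rule (env_flag_enabled is a name introduced here for illustration, not part of the original code):

import os

def env_flag_enabled(name: str) -> bool:
    # True only when the variable is set to "1" or "true" (any case).
    value = os.getenv(name)
    return value is not None and value.lower() in ("1", "true")

# Skip HCL config generation when host-NIC scaling is enabled
# or a config path has already been provided.
gen_hcl_config = not (env_flag_enabled("HCCL_OVER_TCP")
                      or env_flag_enabled("HCCL_OVER_OFI")
                      or os.getenv("HCL_CONFIG_PATH"))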
    def create_multi_hls_setup(self, tmp_dir):
        #
        # Multi-HLS Mode
        #
        gen_hcl_path = Path(__file__).parent.joinpath('generate_hcl_config.py')
        # Create HCL config on each remote IP.
        run_per_ip(f"{sys.executable} {str(gen_hcl_path)} {str(tmp_dir)} {self.num_workers_per_hls} {self.hls_type}", [
                   'MULTI_HLS_IPS', 'PYTHONPATH', 'HOROVOD_HIERARCHICAL_ALLREDUCE'], False)

        # Set HCL_CONFIG_PATH in this script, so it can be propagated in self.mpirun_cmd to remote IPs.
        hcl_config_path = generate_hcl_config.generate_hcl_config_unless_hccl(
            str(tmp_dir), self.num_workers_per_hls, self.hls_type)

        multi_hls_nodes = get_multi_node_config_nodes()
        self.num_workers_total = len(
            multi_hls_nodes) * self.num_workers_per_hls
        print(f"self.num_workers_total = {self.num_workers_total}")
        print(
            f"++++++++++ Multi-HLS ({self.num_workers_total}-cards): effective HCL_CONFIG_PATH = {hcl_config_path}")

        mpi_hostfile_path = generate_mpi_hostfile(
            str(tmp_dir), self.num_workers_per_hls)
        assert mpi_hostfile_path != '', "Don't have a valid mpi_hostfile_path for MULTI_HLS_IPS scenario"
        print(f"mpi_hostfile_path = {mpi_hostfile_path} ->")
        print_file_contents(mpi_hostfile_path)

        self.mpirun_cmd += f" -np {self.num_workers_total}"
        if os.environ.get('DOCKER_SSHD_PORT'):
            portnum = os.environ.get('DOCKER_SSHD_PORT')
        else:
            portnum = 3022
        self.mpirun_cmd += f" --mca plm_rsh_args -p{portnum}"
        self.mpirun_cmd += f" --mca btl_tcp_if_include {get_mpi_tcp_include()}"
        self.mpirun_cmd += f" -hostfile {mpi_hostfile_path}"
        self.mpirun_cmd += " --prefix $MPI_ROOT"

        for env_var in get_relevant_env_vars():
            self.mpirun_cmd += f" -x {env_var}={shlex.quote(os.environ[env_var])}"
            # Note that the =value part above is not strictly necessary, but it provides useful information when presented this way in the log file.

        return hcl_config_path
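generate_mpi_hostfile is referenced above but not defined in this snippet. A minimal sketch of producing an OpenMPI-style hostfile (one "<host> slots=<N>" line per node), assuming the node list comes from MULTI_HLS_IPS (the function name and output file name here are illustrative only):

import os
from pathlib import Path

def generate_mpi_hostfile_sketch(out_dir: str, slots_per_node: int) -> str:
    # Hypothetical sketch: write one 'host slots=N' line per node and return the path.
    hosts = [h.strip() for h in os.environ.get("MULTI_HLS_IPS", "").split(",") if h.strip()]
    hostfile = Path(out_dir) / "hostfile"
    hostfile.write_text("".join(f"{host} slots={slots_per_node}\n" for host in hosts))
    return str(hostfile)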
    def create_multi_worker_setup(self):
        if not self.kubernetes_run:
            assert self.scaleout and self.num_workers_per_hls > 1, "Scaleout run requires at least 2 workers"
        tmp_dir = Path(os.path.expandvars(os.path.expanduser("$HOME/tmp/")))
        run_per_ip(f"mkdir -p {str(tmp_dir)}",
                   ['MULTI_HLS_IPS', 'PYTHONPATH'], False, self.kubernetes_run)
        hcl_config_path = ''

        if self.kubernetes_run:
            hcl_config_path = Path(
                os.environ.get('HCL_CONFIG_PATH'))

            # Printing env var HCL_CONFIG_PATH has been proven to be misleading.
            # print(
            #     f"HCL_CONFIG_PATH = {str(os.environ.get('HCL_CONFIG_PATH'))}")

            print(f"Effective HCL_CONFIG_PATH = {hcl_config_path} ->")
            print_file_contents(hcl_config_path)
            return

        print(f"MULTI_HLS_IPS={os.environ.get('MULTI_HLS_IPS')}")

        output_file_name = str(tmp_dir.joinpath(self.output_filename))
        self.mpirun_cmd = self.create_mpi_cmdline(output_file_name)

        if is_valid_multi_node_config():
            hcl_config_path = self.create_multi_hls_setup(tmp_dir)
        else:
            hcl_config_path = self.create_single_hls_setup(tmp_dir)

        # Printing env var HCL_CONFIG_PATH has been proven to be misleading.
        #print(f"HCL_CONFIG_PATH = {str(os.environ.get('HCL_CONFIG_PATH'))}")

        if hcl_config_path is not None:
            print(f"Effective HCL_CONFIG_PATH = {hcl_config_path} ->")
            print_file_contents(hcl_config_path)
        else:
            print(f"HCL Config is not used in this run.")

        print(f"{self.__class__.__name__} create_multi_worker_setup(): self.mpirun_cmd = {self.mpirun_cmd}")