def mpi_run(self):
    """Launch MPI-based distributed training, acting as master or worker.

    The current host's role is decided by MPIMaster.is_master(): the master
    host drives the MPI job, while every other host runs an MPIWorker that
    waits for the master's commands.
    """
    training_env = sagemaker_containers.training_env()
    print("MPI requested with process per hosts: {}"
          .format(self._num_of_processes_per_host))

    # Configure the MPI runtime and write out the wrapped training script
    # before deciding which role this host plays.
    _setup_mpi_environment(training_env)
    _create_mpi_script(training_env, self._train_script, self._train_script_args)

    master = MPIMaster(training_env, self._num_of_processes_per_host,
                       self._instance_type)
    if master.is_master(training_env.hosts, training_env.current_host):
        print("Inside Master")
        master()
    else:
        print("Inside Worker")
        MPIWorker()(training_env)
    # Tail of a model-loading helper (the def header is outside this chunk):
    # wrap the model for multi-GPU data parallelism, restore the saved
    # weights, and move it to the target device.
    logger.info("Gpu count: {}".format(torch.cuda.device_count()))
    model = nn.DataParallel(model)
    with open(os.path.join(model_dir, 'model.pth'), 'rb') as f:
        model.load_state_dict(torch.load(f))
    return model.to(device)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # Training hyper-parameters, overridable from the command line.
    parser.add_argument('--workers', type=int, default=2, metavar='W',
                        help='number of data loading workers (default: 2)')
    parser.add_argument('--epochs', type=int, default=2, metavar='E',
                        help='number of total epochs to run (default: 2)')
    parser.add_argument('--batch_size', type=int, default=4, metavar='BS',
                        help='batch size (default: 4)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='initial learning rate (default: 0.001)')
    parser.add_argument('--momentum', type=float, default=0.9, metavar='M',
                        help='momentum (default: 0.9)')
    parser.add_argument('--dist_backend', type=str, default='gloo',
                        help='distributed backend (default: gloo)')

    # Container-provided settings, defaulted from the SageMaker training env.
    env = sagemaker_containers.training_env()
    # NOTE(review): type=list would split a CLI-supplied string into single
    # characters; the default (env.hosts, already a list) is unaffected.
    # Confirm --hosts is never passed explicitly, or switch to json.loads.
    parser.add_argument('--hosts', type=list, default=env.hosts)
    parser.add_argument('--current-host', type=str, default=env.current_host)
    parser.add_argument('--model-dir', type=str, default=env.model_dir)
    parser.add_argument('--data-dir', type=str,
                        default=env.channel_input_dirs.get('training'))
    parser.add_argument('--num-gpus', type=int, default=env.num_gpus)

    _train(parser.parse_args())
    # Tail of an MNIST preprocessing helper (the def header is outside this
    # chunk): shape the image array, scale pixel values, and optionally pair
    # images with labels.
    if rgb_format:
        # Replicate the single grayscale channel into 3 channels; broadcast_to
        # creates a view, so no pixel data is copied here.
        images = np.broadcast_to(images, (len(images), 3) + images.shape[2:])
    elif ndim != 1:
        raise ValueError("invalid ndim for MNIST dataset")
    images = images.astype(image_dtype)
    # Rescale raw [0, 255] pixel values into [0, scale].
    images *= scale / 255.0
    if withlabel:
        # NOTE(review): only the last 100 labels are taken ([-100:]) —
        # presumably the images were sliced the same way upstream; confirm
        # the counts match before trusting the pairing.
        labels = raw["y"][-100:].astype(label_dtype)
        return tuple_dataset.TupleDataset(images, labels)
    return images


if __name__ == "__main__":
    env = sagemaker_containers.training_env()
    parser = argparse.ArgumentParser()

    # Data and model checkpoints directories
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--communicator", type=str, default="pure_nccl")
    parser.add_argument("--frequency", type=int, default=20)
    parser.add_argument("--units", type=int, default=1000)
    parser.add_argument("--model-dir", type=str)
    parser.add_argument("--output-data-dir", type=str, default=env.output_data_dir)
    parser.add_argument("--host", type=str, default=env.current_host)
def test_create_training_env_without_training_files_and_directories_should_not_fail():
    """Building a training env with no config files present should succeed,
    with host information defaulting to the local machine."""
    env = sagemaker_containers.training_env()
    local_host = socket.gethostname()
    assert env.current_host == local_host
    assert env.hosts == [local_host]
def train():
    """Run SageMaker training and exit with an outcome-derived status code.

    Delegates to the configured framework entry point when one is set,
    otherwise executes the user entry point directly (optionally under MPI).
    Writes a SUCCESS or FAILURE marker file, always joins the
    intermediate-output sync thread, and finally terminates all processes.
    """
    intermediate_sync = None
    exit_code = SUCCESS_CODE
    try:
        # TODO: iquintero - add error handling for ImportError to let the user know
        # if the framework module is not defined.
        env = sagemaker_containers.training_env()

        # TODO: [issue#144] There is a bug in the logic -
        # we need os.environ.get(_params.REGION_NAME_ENV)
        # in certain regions, but it is not going to be available unless
        # TrainingEnvironment has been initialized. It shouldn't be environment variable.
        region = os.environ.get('AWS_REGION', os.environ.get(_params.REGION_NAME_ENV))
        intermediate_sync = _intermediate_output.start_sync(
            env.sagemaker_s3_output(), region)

        if env.framework_module:
            framework_name, entry_point_name = env.framework_module.split(':')
            framework = importlib.import_module(framework_name)

            # the logger is configured after importing the framework library,
            # allowing the framework to configure logging at import time.
            _logging.configure_logger(env.log_level)
            logger.info('Imported framework %s', framework_name)

            entrypoint = getattr(framework, entry_point_name)
            entrypoint()
        else:
            _logging.configure_logger(env.log_level)

            mpi_enabled = env.additional_framework_parameters.get(
                _params.MPI_ENABLED)
            runner_type = _runner.RunnerType.MPI if mpi_enabled else _runner.RunnerType.Process

            entry_point.run(env.module_dir, env.user_entry_point,
                            env.to_cmd_args(), env.to_env_vars(),
                            runner=runner_type)

        logger.info('Reporting training SUCCESS')
        _files.write_success_file()
    except _errors.ClientError as e:
        failure_message = str(e)
        _files.write_failure_file(failure_message)
        logger.error(failure_message)

        if intermediate_sync:
            intermediate_sync.join()
        exit_code = DEFAULT_FAILURE_CODE
    except Exception as e:  # pylint: disable=broad-except
        failure_msg = 'framework error: \n%s\n%s' % (traceback.format_exc(), str(e))
        _files.write_failure_file(failure_msg)
        logger.error('Reporting training FAILURE')
        logger.error(failure_msg)

        # Process exit statuses are interpreted modulo 256, so an errno of
        # None, 0, or >= 256 could wrap around — possibly to 0, which would
        # report this failure as success. Clamp invalid values to the
        # default failure code.
        error_number = getattr(e, 'errno', DEFAULT_FAILURE_CODE)
        if isinstance(error_number, int) and 0 < error_number < 256:
            exit_code = error_number
        else:
            exit_code = DEFAULT_FAILURE_CODE
    finally:
        if intermediate_sync:
            intermediate_sync.join()

    _exit_processes(exit_code)
def train():
    """Run SageMaker training and terminate with an outcome-derived exit code.

    Uses the configured framework entry point when present, otherwise runs
    the user entry point directly (optionally under MPI). A SUCCESS or
    FAILURE marker file is written, and the intermediate-output sync thread
    is always joined before the processes are exited.
    """
    intermediate_sync = None
    exit_code = SUCCESS_CODE
    try:
        env = sagemaker_containers.training_env()

        region = os.environ.get("AWS_REGION", os.environ.get(_params.REGION_NAME_ENV))
        s3_endpoint_url = os.environ.get(_params.S3_ENDPOINT_URL, None)
        intermediate_sync = _intermediate_output.start_sync(
            env.sagemaker_s3_output(), region, endpoint_url=s3_endpoint_url
        )

        if env.framework_module:
            framework_name, entry_point_name = env.framework_module.split(":")
            framework_module = importlib.import_module(framework_name)

            # Logging is configured only after the framework import so the
            # framework gets a chance to set up logging at import time.
            _logging.configure_logger(env.log_level)
            logger.info("Imported framework %s", framework_name)

            getattr(framework_module, entry_point_name)()
        else:
            _logging.configure_logger(env.log_level)

            use_mpi = env.additional_framework_parameters.get(_params.MPI_ENABLED)
            entry_point.run(
                env.module_dir,
                env.user_entry_point,
                env.to_cmd_args(),
                env.to_env_vars(),
                runner=_runner.RunnerType.MPI if use_mpi else _runner.RunnerType.Process,
            )

        logger.info("Reporting training SUCCESS")
        _files.write_success_file()
    except _errors.ClientError as e:
        message = str(e)
        _files.write_failure_file(message)
        logger.error(message)

        if intermediate_sync:
            intermediate_sync.join()
        exit_code = DEFAULT_FAILURE_CODE
    except Exception as e:  # pylint: disable=broad-except
        failure_msg = "framework error: \n%s\n%s" % (traceback.format_exc(), str(e))
        _files.write_failure_file(failure_msg)
        logger.error("Reporting training FAILURE")
        logger.error(failure_msg)

        # Normalize errno into a valid process exit code before exiting.
        exit_code = _get_valid_failure_exit_code(
            getattr(e, "errno", DEFAULT_FAILURE_CODE)
        )
    finally:
        if intermediate_sync:
            intermediate_sync.join()
        _exit_processes(exit_code)
def execute_horovod_script(train_script, processes_per_host):
    """Announce and launch a Horovod training run for the given script."""
    banner = (
        "Starting Horovod training with Horovod train script: {} Num processes per host: {}"
    )
    print(banner.format(train_script, processes_per_host))
    _horovod_run(sagemaker_containers.training_env(), processes_per_host, train_script)
def framework_training_with_run_modules_fn(capture_error):
    """Execute the user module through the run_module code path."""
    env = sagemaker_containers.training_env()
    modules.run_module(
        env.module_dir,
        env.to_cmd_args(),
        env.to_env_vars(),
        env.module_name,
        capture_error=capture_error,
    )
def framework_training_with_script_mode_fn(capture_error):
    """Execute the user entry point through the script-mode code path."""
    env = sagemaker_containers.training_env()
    entry_point.run(
        env.module_dir,
        env.user_entry_point,
        env.to_cmd_args(),
        env.to_env_vars(),
        capture_error=capture_error,
    )
def execute_horovod_script(train_script, processes_per_host=1):
    """Run the given train script under Horovod, one process per host by default."""
    _horovod_run(sagemaker_containers.training_env(), processes_per_host, train_script)
def main():
    """Script entry point: build the training environment and train on it."""
    env = training_env()
    train(env)