# NOTE(review): this fragment is a mangled paste — `mpi_run` takes `self` but is
# declared at module level, and two unrelated snippets are fused below. Code is
# kept byte-identical; flagged for reconstruction from the original sources.
def mpi_run(self):
        # Build the SageMaker training environment (hosts, current host, params).
        env = sagemaker_containers.training_env()
        print("MPI requested with process per hosts: {}"
              .format(self._num_of_processes_per_host))

        # Prepare MPI environment variables and the wrapper script that will
        # execute the user's training script under mpirun.
        _setup_mpi_environment(env)
        _create_mpi_script(env, self._train_script, self._train_script_args)

        # The master host launches the MPI job; all other hosts run as workers.
        mpi_master = MPIMaster(env, self._num_of_processes_per_host, self._instance_type)
        if mpi_master.is_master(env.hosts, env.current_host):
            print("Inside Master")
            mpi_master()
        else:
            print("Inside Worker")
            MPIWorker()(env)
        # NOTE(review): `model` below is referenced before assignment in this
        # fragment — these two lines appear pasted in from a model-loading
        # helper, not part of mpi_run.
        logger.info("Gpu count: {}".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)

    # NOTE(review): dedented tail of a different function (a model_fn-style
    # loader) whose `def` line is outside this chunk: loads serialized weights
    # and moves the model to the target device.
    with open(os.path.join(model_dir, 'model.pth'), 'rb') as f:
        model.load_state_dict(torch.load(f))
    return model.to(device)


if __name__ == '__main__':
    # Local import: only needed to parse the --hosts CLI value as a JSON list.
    import json

    parser = argparse.ArgumentParser()

    # Training hyperparameters.
    parser.add_argument('--workers', type=int, default=2, metavar='W',
                        help='number of data loading workers (default: 2)')
    parser.add_argument('--epochs', type=int, default=2, metavar='E',
                        help='number of total epochs to run (default: 2)')
    parser.add_argument('--batch_size', type=int, default=4, metavar='BS',
                        help='batch size (default: 4)')
    parser.add_argument('--lr', type=float, default=0.001, metavar='LR',
                        help='initial learning rate (default: 0.001)')
    parser.add_argument('--momentum', type=float, default=0.9, metavar='M', help='momentum (default: 0.9)')
    parser.add_argument('--dist_backend', type=str, default='gloo', help='distributed backend (default: gloo)')

    # SageMaker-provided environment: hosts, data/model dirs, GPU count.
    env = sagemaker_containers.training_env()
    # FIX: `type=list` applies list() to the raw string, splitting it into
    # individual characters (e.g. "algo-1" -> ['a','l','g','o','-','1']).
    # Parse the argument as a JSON-encoded list instead; the default
    # (env.hosts) is already a list and is used verbatim when the flag is
    # absent, so existing callers are unaffected.
    parser.add_argument('--hosts', type=json.loads, default=env.hosts)
    parser.add_argument('--current-host', type=str, default=env.current_host)
    parser.add_argument('--model-dir', type=str, default=env.model_dir)
    parser.add_argument('--data-dir', type=str, default=env.channel_input_dirs.get('training'))
    parser.add_argument('--num-gpus', type=int, default=env.num_gpus)

    _train(parser.parse_args())
        # NOTE(review): fragment — the enclosing `def` (an MNIST-style dataset
        # loader) starts above this chunk and is not visible; code kept
        # byte-identical. Indentation of the first branch looks paste-damaged.
        if rgb_format:
            # Replicate the single grayscale channel into 3 RGB channels.
            images = np.broadcast_to(images,
                                     (len(images), 3) + images.shape[2:])
    elif ndim != 1:
        raise ValueError("invalid ndim for MNIST dataset")
    images = images.astype(image_dtype)
    # Rescale pixel values from [0, 255] into [0, scale].
    images *= scale / 255.0

    if withlabel:
        # Only the last 100 labels/images pair up here — presumably a small
        # evaluation subset; TODO confirm against the full loader upstream.
        labels = raw["y"][-100:].astype(label_dtype)
        return tuple_dataset.TupleDataset(images, labels)
    return images


if __name__ == "__main__":
    # SageMaker-provided training environment (hosts, dirs, hyperparameters).
    env = sagemaker_containers.training_env()

    parser = argparse.ArgumentParser()

    # Data and model checkpoints directories
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--batch-size", type=int, default=64)
    # presumably a ChainerMN communicator name ("pure_nccl" targets GPU
    # clusters) — confirm against the trainer setup in the full script.
    parser.add_argument("--communicator", type=str, default="pure_nccl")
    parser.add_argument("--frequency", type=int, default=20)
    parser.add_argument("--units", type=int, default=1000)

    parser.add_argument("--model-dir", type=str)
    parser.add_argument("--output-data-dir",
                        type=str,
                        default=env.output_data_dir)
    parser.add_argument("--host", type=str, default=env.current_host)
    # NOTE(review): snippet is truncated here — the parse_args()/training
    # calls that would normally follow are not visible in this chunk.
# Example 4 (score: 0)
def test_create_training_env_without_training_files_and_directories_should_not_fail(
):
    """Building a training env on a bare host must still succeed.

    With no training files or directories present, the environment falls back
    to the local hostname for both ``current_host`` and ``hosts``.
    """
    env = sagemaker_containers.training_env()
    expected_host = socket.gethostname()
    assert env.current_host == expected_host
    assert env.hosts == [expected_host]
# Example 5 (score: 0)
def train():
    """Run a SageMaker training job inside the container.

    Resolves the training environment, starts syncing intermediate output to
    S3, dispatches either to a framework-provided entry point or to the
    generic user entry point, writes the SUCCESS/FAILURE marker file, and
    finally exits the process with an appropriate status code.
    """
    intermediate_sync = None
    exit_code = SUCCESS_CODE
    try:
        # TODO: iquintero - add error handling for ImportError to let the user know
        # if the framework module is not defined.
        env = sagemaker_containers.training_env()

        # TODO: [issue#144] There is a bug in the logic -
        # we need os.environ.get(_params.REGION_NAME_ENV)
        # in certain regions, but it is not going to be available unless
        # TrainingEnvironment has been initialized. It shouldn't be environment variable.
        region = os.environ.get('AWS_REGION',
                                os.environ.get(_params.REGION_NAME_ENV))
        intermediate_sync = _intermediate_output.start_sync(
            env.sagemaker_s3_output(), region)

        if env.framework_module:
            # "package.module:function" -> import the module, call the function.
            framework_name, entry_point_name = env.framework_module.split(':')

            framework = importlib.import_module(framework_name)

            # the logger is configured after importing the framework library, allowing the framework to
            # configure logging at import time.
            _logging.configure_logger(env.log_level)
            logger.info('Imported framework %s', framework_name)

            entrypoint = getattr(framework, entry_point_name)
            entrypoint()
        else:
            _logging.configure_logger(env.log_level)

            # Launch via MPI when the estimator enabled it; plain process otherwise.
            mpi_enabled = env.additional_framework_parameters.get(
                _params.MPI_ENABLED)
            runner_type = _runner.RunnerType.MPI if mpi_enabled else _runner.RunnerType.Process

            entry_point.run(env.module_dir,
                            env.user_entry_point,
                            env.to_cmd_args(),
                            env.to_env_vars(),
                            runner=runner_type)

        logger.info('Reporting training SUCCESS')

        _files.write_success_file()
    except _errors.ClientError as e:
        # Client (user-code/input) errors: record the message so the platform
        # can surface it to the caller.
        failure_message = str(e)
        _files.write_failure_file(failure_message)

        logger.error(failure_message)

        if intermediate_sync:
            intermediate_sync.join()

        exit_code = DEFAULT_FAILURE_CODE
    except Exception as e:  # pylint: disable=broad-except
        failure_msg = 'framework error: \n%s\n%s' % (traceback.format_exc(),
                                                     str(e))

        _files.write_failure_file(failure_msg)
        logger.error('Reporting training FAILURE')

        logger.error(failure_msg)

        # FIX: e.errno may be None, a non-int, or outside the valid process
        # exit-code range (1-255); an invalid value (notably 0) would make the
        # container report success on failure. Fall back to the default
        # failure code in that case.
        error_number = getattr(e, 'errno', DEFAULT_FAILURE_CODE)
        if isinstance(error_number, int) and 0 < error_number < 256:
            exit_code = error_number
        else:
            exit_code = DEFAULT_FAILURE_CODE
    finally:
        if intermediate_sync:
            intermediate_sync.join()

        _exit_processes(exit_code)
# Example 6 (score: 0)
def train():
    """Run a SageMaker training job inside the container.

    Resolves the training environment, starts syncing intermediate output to
    S3, dispatches either to a framework-provided entry point or to the
    generic user entry point, writes the SUCCESS/FAILURE marker file, and
    finally exits the process with a validated status code.
    """
    intermediate_sync = None
    exit_code = SUCCESS_CODE
    try:
        env = sagemaker_containers.training_env()

        # Region comes from AWS_REGION, falling back to the SDK-specific
        # region env var; an optional custom S3 endpoint is honored too.
        region = os.environ.get("AWS_REGION",
                                os.environ.get(_params.REGION_NAME_ENV))
        s3_endpoint_url = os.environ.get(_params.S3_ENDPOINT_URL, None)
        intermediate_sync = _intermediate_output.start_sync(
            env.sagemaker_s3_output(), region, endpoint_url=s3_endpoint_url)

        if env.framework_module:
            # "package.module:function" -> import the module, call the function.
            framework_name, entry_point_name = env.framework_module.split(":")

            framework = importlib.import_module(framework_name)

            # the logger is configured after importing the framework library, allowing
            # the framework to configure logging at import time.
            _logging.configure_logger(env.log_level)
            logger.info("Imported framework %s", framework_name)

            entrypoint = getattr(framework, entry_point_name)
            entrypoint()
        else:
            _logging.configure_logger(env.log_level)

            # Launch via MPI when the estimator enabled it; plain process otherwise.
            mpi_enabled = env.additional_framework_parameters.get(
                _params.MPI_ENABLED)
            runner_type = _runner.RunnerType.MPI if mpi_enabled else _runner.RunnerType.Process

            entry_point.run(
                env.module_dir,
                env.user_entry_point,
                env.to_cmd_args(),
                env.to_env_vars(),
                runner=runner_type,
            )

        logger.info("Reporting training SUCCESS")

        _files.write_success_file()
    except _errors.ClientError as e:
        # Client (user-code/input) errors: record the message so the platform
        # can surface it to the caller.
        failure_message = str(e)
        _files.write_failure_file(failure_message)

        logger.error(failure_message)

        if intermediate_sync:
            intermediate_sync.join()

        exit_code = DEFAULT_FAILURE_CODE
    except Exception as e:  # pylint: disable=broad-except
        failure_msg = "framework error: \n%s\n%s" % (traceback.format_exc(),
                                                     str(e))

        _files.write_failure_file(failure_msg)
        logger.error("Reporting training FAILURE")

        logger.error(failure_msg)

        # e.errno may be missing or unusable as a process exit code; the
        # helper maps invalid values to a safe failure code.
        error_number = getattr(e, "errno", DEFAULT_FAILURE_CODE)
        exit_code = _get_valid_failure_exit_code(error_number)
    finally:
        if intermediate_sync:
            intermediate_sync.join()

        _exit_processes(exit_code)
# Example 7 (score: 0)
def execute_horovod_script(train_script, processes_per_host):
    """Launch Horovod-based training for ``train_script``."""
    banner = (
        "Starting Horovod training with Horovod train script: {} Num processes per host: {}"
        .format(train_script, processes_per_host))
    print(banner)
    training_environment = sagemaker_containers.training_env()
    _horovod_run(training_environment, processes_per_host, train_script)
# Example 8 (score: 0)
def framework_training_with_run_modules_fn(capture_error):
    """Execute the user training module through the modules runner."""
    env = sagemaker_containers.training_env()
    modules.run_module(
        env.module_dir,
        env.to_cmd_args(),
        env.to_env_vars(),
        env.module_name,
        capture_error=capture_error,
    )
# Example 9 (score: 0)
def framework_training_with_script_mode_fn(capture_error):
    """Execute the user entry point in script mode."""
    env = sagemaker_containers.training_env()
    entry_point.run(
        env.module_dir,
        env.user_entry_point,
        env.to_cmd_args(),
        env.to_env_vars(),
        capture_error=capture_error,
    )
def execute_horovod_script(train_script, processes_per_host=1):
    """Run ``train_script`` under Horovod with the given process count."""
    training_environment = sagemaker_containers.training_env()
    _horovod_run(training_environment, processes_per_host, train_script)
# Example 11 (score: 0)
def main():
    """Entry point: build the training environment and run training."""
    env = training_env()
    train(env)