def train(hosts, current_host, num_gpus, custom_mpi_cmds):
    """Run OpenMPI/Horovod distributed training across ``hosts``.

    The first host in ``hosts`` acts as the MPI master: it generates an
    ``mpi_cmd.sh`` launch script, waits for sshd on all worker nodes, and
    executes the script. Every other host starts sshd and blocks until the
    master has started and then finished.

    Args:
        hosts: list of all hostnames in the cluster; ``hosts[0]`` is master.
        current_host: hostname of this container.
        num_gpus: GPUs per host; one MPI process slot is used per GPU.
        custom_mpi_cmds: extra mpirun argument lines appended verbatim to
            the generated script.
    """
    hyperparameters = framework.env.read_hyperparameters()
    env = framework.training_env(hyperparameters=hyperparameters)
    process_slots_per_host = num_gpus

    _start_ssh_daemon()

    # Comment out the conflicting btl_tcp_if_exclude MPI setting so the
    # explicit btl_tcp_if_include passed below takes effect.
    subprocess.check_call(
        "sed -ie \"s/btl_tcp_if_exclude/#btl_tcp_if_exclude/g\" /usr/local/etc/openmpi-mca-params.conf",
        shell=True)

    if current_host == hosts[0]:
        # Plain "host" entries when one slot per host, otherwise "host:slots".
        host_list = hosts if process_slots_per_host == 1 else \
            [host + ':{}'.format(process_slots_per_host) for host in hosts]
        num_processes = process_slots_per_host * len(hosts)
        # AWS credentials are forwarded to remote ranks only when present.
        credential_vars = ['AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_SESSION_TOKEN']

        # Build mpirun file
        mpi_command = [
            '#!/usr/bin/env bash \n',
            '/usr/local/bin/mpirun --allow-run-as-root --display-map --tag-output --host {} \\\n'.format(",".join(host_list)),
            ' --mca plm_rsh_no_tree_spawn 1 \\\n',
            ' -mca pml ob1 \\\n',
            ' -mca btl ^openib \\\n',
            ' -bind-to None \\\n',
            ' -map-by slot \\\n',
            # FIX: these two entries were accidentally one implicitly
            # concatenated string (missing comma). Splitting them is
            # byte-identical in the written script, since entries are
            # written out sequentially.
            ' -mca btl_vader_single_copy_mechanism none \\\n',
            ' -mca btl_tcp_if_include {} \\\n'.format(env.network_interface_name),
            ' -mca oob_tcp_if_include {} \\\n'.format(env.network_interface_name),
            ' -x NCCL_SOCKET_IFNAME={} \\\n'.format(env.network_interface_name),
            ' -x NCCL_MIN_NRINGS=8 \\\n',
            ' -x HOROVOD_CYCLE_TIME=0.5 \\\n',
            ' -x TF_CUDNN_USE_AUTOTUNE=0 \\\n',
            ' -x HOROVOD_FUSION_THRESHOLD=67108864 \\\n',
            ' -x TENSORPACK_FP16=1 \\\n',
            ' -x PATH \\\n',
            ' -x LD_LIBRARY_PATH \\\n',
            ' -x NCCL_DEBUG=INFO \\\n',
            ' -mca orte_abort_on_non_zero_status 1 \\\n',
            ' -np {} \\\n'.format(num_processes)]
        for v in credential_vars:
            if v in os.environ:
                mpi_command.append(" -x {} \\\n".format(v))
        for cmd in custom_mpi_cmds:
            mpi_command.append("{} \\\n".format(cmd))
        mpi_command.append("/opt/ml/code/run.sh")

        # Write file and launch mpi.
        # BUGFIX: open with 'w' (was 'a') so a container retry does not
        # append a second copy of the script to a stale mpi_cmd.sh.
        with open('mpi_cmd.sh', 'w') as the_file:
            for item in mpi_command:
                the_file.write(item)
        with open('mpi_cmd.sh', 'r') as the_file:
            logger.info('MPI script:\n\n%s', the_file.read())
        subprocess.check_call("chmod +x mpi_cmd.sh", shell=True)
        _wait_for_worker_nodes_to_start_sshd(hosts)
        subprocess.check_call("./mpi_cmd.sh", shell=True)
    else:
        # Worker node: wait for the master to come up, then block until
        # it reports completion.
        _wait_master_to_start(hosts[0])
        _wait_master_to_finish(hosts[0])
def main():
    """Training entry point.

    Reads hyperparameters, builds the training environment, configures S3,
    and hands off to train(). When the job is part of a hyperparameter
    tuning run, the training job name is appended to model_dir so the
    concurrent jobs do not read from / write to the same S3 object.
    """
    hps = framework.env.read_hyperparameters()
    env = framework.training_env(hyperparameters=hps)
    user_hps = env.hyperparameters

    # Tuning jobs share a model_dir by default; make it unique per job.
    if '_tuning_objective_metric' in hps:
        model_dir = _model_dir_with_training_job(hps.get('model_dir'),
                                                 env.job_name)
        logger.info('Appending the training job name to model_dir: {}'.format(
            model_dir))
        user_hps['model_dir'] = model_dir

    s3_utils.configure(user_hps.get('model_dir'),
                       os.environ.get('SAGEMAKER_REGION'))
    train(env, framework.mapping.to_cmd_args(user_hps))
    _log_model_missing_warning(MODEL_DIR)
def main():
    """Script entry point: train with the default environment, then exit 0."""
    training_environment = framework.training_env()
    train(training_environment)
    sys.exit(0)
def main():
    """Script entry point: build the default training environment and train."""
    training_environment = framework.training_env()
    train(training_environment)
def main():
    """Entry point: read hyperparameters, apply the log level, and train."""
    hps = framework.env.read_hyperparameters()
    training_environment = framework.training_env(hyperparameters=hps)
    # Honor the log level requested by the environment before training starts.
    logger.setLevel(training_environment.log_level)
    train(training_environment, hps)
def test_env_vars_round_trip():
    """Round-trip check: configs fed into framework.training_env() must come
    back out of to_env_vars() as the expected SM_* environment variables.
    """
    # Raw hyperparameters: user values plus sagemaker_* framework keys.
    hyperparameters = {
        "loss": "SGD",
        "sagemaker_program": "user_script.py",
        "epochs": 10,
        "batch_size": 64,
        "precision": 5.434322,
        "sagemaker_region": "us-west-2",
        "sagemaker_job_name": "horovod-training-job",
        "sagemaker_submit_directory": "s3/something",
    }
    # Three-host cluster with algo-1 as the current host.
    resource_config = {
        "current_host": "algo-1",
        "hosts": ["algo-1", "algo-2", "algo-3"]
    }
    input_data_config = {
        "train": {
            "ContentType": "trainingContentType",
            "TrainingInputMode": "File",
            "S3DistributionType": "FullyReplicated",
            "RecordWrapperType": "None",
        },
        "validation": {
            "TrainingInputMode": "File",
            "S3DistributionType": "FullyReplicated",
            "RecordWrapperType": "None",
        },
    }
    # Point the framework at a dummy training module while the env is
    # constructed, then clear the variable again.
    os.environ[
        framework.params.
        FRAMEWORK_TRAINING_MODULE_ENV] = "test.functional.simple_framework:train"
    training_env = framework.training_env(
        resource_config=resource_config,
        input_data_config=input_data_config,
        hyperparameters=hyperparameters,
    )
    os.environ[framework.params.FRAMEWORK_TRAINING_MODULE_ENV] = ""
    args = framework.mapping.to_cmd_args(training_env.hyperparameters)
    env_vars = training_env.to_env_vars()
    # SM_USER_ARGS is derived here from the cmd args, not by to_env_vars().
    env_vars["SM_USER_ARGS"] = " ".join(args)
    assert env_vars["SM_OUTPUT_DATA_DIR"] == training_env.output_data_dir
    # Input data config is serialized as compact JSON (keys sorted, per the
    # expected literal below).
    assert (
        env_vars["SM_INPUT_DATA_CONFIG"] ==
        '{"train":{"ContentType":"trainingContentType",'
        '"RecordWrapperType":"None","S3DistributionType":"FullyReplicated",'
        '"TrainingInputMode":"File"},"validation":{"RecordWrapperType":"None",'
        '"S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}}')
    assert env_vars["SM_NETWORK_INTERFACE_NAME"] == "eth0"
    assert env_vars["SM_LOG_LEVEL"] == "20"  # 20 == logging.INFO
    assert env_vars["SM_INPUT_DIR"].endswith("/opt/ml/input")
    assert env_vars["SM_NUM_CPUS"] == str(training_env.num_cpus)
    assert env_vars["SM_HP_BATCH_SIZE"] == "64"
    assert env_vars["SM_CHANNEL_TRAIN"].endswith("/opt/ml/input/data/train")
    assert env_vars["SM_CHANNEL_VALIDATION"].endswith(
        "/opt/ml/input/data/validation")
    assert env_vars["SM_HP_EPOCHS"] == "10"
    # SM_HPS carries only the user hyperparameters as compact JSON — the
    # sagemaker_* keys are stripped.
    assert env_vars[
        "SM_HPS"] == '{"batch_size":64,"epochs":10,"loss":"SGD","precision":5.434322}'
    assert env_vars["SM_HP_PRECISION"] == "5.434322"
    assert (env_vars["SM_RESOURCE_CONFIG"] ==
            '{"current_host":"algo-1","hosts":["algo-1","algo-2","algo-3"]}')
    # Module name is sagemaker_program with the .py suffix dropped.
    assert env_vars["SM_MODULE_NAME"] == "user_script"
    assert env_vars["SM_INPUT_CONFIG_DIR"].endswith("/opt/ml/input/config")
    # User args are the sorted user hyperparameters rendered as CLI flags.
    assert env_vars[
        "SM_USER_ARGS"] == "--batch_size 64 --epochs 10 --loss SGD --precision 5.434322"
    assert env_vars["SM_OUTPUT_DIR"].endswith("/opt/ml/output")
    assert env_vars["SM_MODEL_DIR"].endswith("/opt/ml/model")
    assert env_vars["SM_HOSTS"] == '["algo-1","algo-2","algo-3"]'
    assert env_vars["SM_NUM_GPUS"] == str(training_env.num_gpus)
    assert env_vars["SM_MODULE_DIR"] == "s3/something"
    assert env_vars["SM_CURRENT_HOST"] == "algo-1"
    assert env_vars["SM_CHANNELS"] == '["train","validation"]'
    assert env_vars["SM_HP_LOSS"] == "SGD"
    assert env_vars[
        "SM_FRAMEWORK_MODULE"] == "test.functional.simple_framework:train"
    # Every property name of the env must appear inside SM_TRAINING_ENV.
    assert all(x in env_vars["SM_TRAINING_ENV"]
               for x in (training_env.properties()))
def train(hosts, current_host, num_gpus, custom_mpi_cmds):
    """Prepare Mask R-CNN training data/code, then run OpenMPI/Horovod
    distributed training across ``hosts``.

    Every host first downloads the pre-trained model, the S3 training data,
    and installs tensorpack-mask-rcnn. The first host in ``hosts`` then acts
    as the MPI master: it generates an ``mpi_cmd.sh`` launch script, waits
    for sshd on all worker nodes, and executes the script. Every other host
    starts sshd and blocks until the master has started and then finished.

    Args:
        hosts: list of all hostnames in the cluster; ``hosts[0]`` is master.
        current_host: hostname of this container.
        num_gpus: GPUs per host; one MPI process slot is used per GPU.
        custom_mpi_cmds: extra mpirun argument lines appended verbatim to
            the generated script.
    """
    hyperparameters = framework.env.read_hyperparameters()
    env = framework.training_env(hyperparameters=hyperparameters)
    process_slots_per_host = num_gpus

    # Data Preprocessing
    print("Download pre-trained model....")
    subprocess.check_call("mkdir -p /opt/ml/code/data/pretrained-models", shell=True)
    subprocess.check_call(
        "wget http://models.tensorpack.com/FasterRCNN/ImageNet-R50-AlignPadding.npz",
        shell=True)
    subprocess.check_call(
        "cp ImageNet-R50-AlignPadding.npz data/pretrained-models", shell=True)
    print("Loading data from s3......")
    subprocess.check_call(
        "aws s3 cp s3://armand-ajay-workshop/mask-rcnn/sagemaker/input/train /opt/ml/code/data --recursive --quiet",
        shell=True)
    print("Loading data finsihed...Install tensorpack....")
    subprocess.check_call(
        "git clone https://github.com/armandmcqueen/tensorpack-mask-rcnn /opt/ml/code/tensorpack-mask-rcnn",
        shell=True)
    subprocess.check_call("chmod -R +w /opt/ml/code/tensorpack-mask-rcnn",
                          shell=True)
    subprocess.check_call(
        "pip install --ignore-installed -e /opt/ml/code/tensorpack-mask-rcnn/",
        shell=True)
    subprocess.check_call("chmod +x /opt/ml/code/run.sh", shell=True)
    print("Tensorpack install finished...")

    _start_ssh_daemon()

    # Comment out the conflicting btl_tcp_if_exclude MPI setting so the
    # explicit btl_tcp_if_include passed below takes effect.
    subprocess.check_call(
        "sed -ie \"s/btl_tcp_if_exclude/#btl_tcp_if_exclude/g\" /usr/local/etc/openmpi-mca-params.conf",
        shell=True)

    if current_host == hosts[0]:
        # Plain "host" entries when one slot per host, otherwise "host:slots".
        host_list = hosts if process_slots_per_host == 1 else \
            [host + ':{}'.format(process_slots_per_host) for host in hosts]
        num_processes = process_slots_per_host * len(hosts)
        # AWS credentials are forwarded to remote ranks only when present.
        credential_vars = [
            'AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_SESSION_TOKEN'
        ]

        # Build mpirun file
        mpi_command = [
            '#!/usr/bin/env bash \n',
            '/usr/local/bin/mpirun --allow-run-as-root --display-map --tag-output --host {} \\\n'.format(",".join(host_list)),
            ' --mca plm_rsh_no_tree_spawn 1 \\\n',
            ' -mca pml ob1 \\\n',
            ' -mca btl ^openib \\\n',
            ' -bind-to None \\\n',
            ' -map-by slot \\\n',
            # FIX: these two entries were accidentally one implicitly
            # concatenated string (missing comma). Splitting them is
            # byte-identical in the written script, since entries are
            # written out sequentially.
            ' -mca btl_vader_single_copy_mechanism none \\\n',
            ' -mca btl_tcp_if_include {} \\\n'.format(env.network_interface_name),
            ' -mca oob_tcp_if_include {} \\\n'.format(env.network_interface_name),
            ' -x NCCL_SOCKET_IFNAME={} \\\n'.format(env.network_interface_name),
            ' -x NCCL_MIN_NRINGS=8 \\\n',
            ' -x HOROVOD_CYCLE_TIME=0.5 \\\n',
            ' -x HOROVOD_FUSION_THRESHOLD=67108864 \\\n',
            ' -x TENSORPACK_FP16=1 \\\n',
            ' -x PATH \\\n',
            ' -x LD_LIBRARY_PATH \\\n',
            ' -x NCCL_DEBUG=INFO \\\n',
            ' -mca orte_abort_on_non_zero_status 1 \\\n',
            ' -np {} \\\n'.format(num_processes)]
        for v in credential_vars:
            if v in os.environ:
                mpi_command.append(" -x {} \\\n".format(v))
        for cmd in custom_mpi_cmds:
            mpi_command.append("{} \\\n".format(cmd))
        mpi_command.append("/opt/ml/code/run.sh")

        # Write file and launch mpi.
        # BUGFIX: open with 'w' (was 'a') so a container retry does not
        # append a second copy of the script to a stale mpi_cmd.sh.
        with open('mpi_cmd.sh', 'w') as the_file:
            for item in mpi_command:
                the_file.write(item)
        with open('mpi_cmd.sh', 'r') as the_file:
            logger.info('MPI script:\n\n%s', the_file.read())
        subprocess.check_call("chmod +x mpi_cmd.sh", shell=True)
        _wait_for_worker_nodes_to_start_sshd(hosts)
        subprocess.check_call("./mpi_cmd.sh", shell=True)
    else:
        # Worker node: wait for the master to come up, then block until
        # it reports completion.
        _wait_master_to_start(hosts[0])
        _wait_master_to_finish(hosts[0])
def test_env_vars_round_trip():
    """Round-trip check: configs fed into framework.training_env() must come
    back out of to_env_vars() as the expected SM_* environment variables.
    """
    # Raw hyperparameters: user values plus sagemaker_* framework keys.
    hyperparameters = {
        'loss': 'SGD',
        'sagemaker_program': 'user_script.py',
        'epochs': 10,
        'batch_size': 64,
        'precision': 5.434322,
        'sagemaker_region': 'us-west-2',
        'sagemaker_job_name': 'horovod-training-job',
        'sagemaker_submit_directory': 's3/something'
    }
    # Three-host cluster with algo-1 as the current host.
    resource_config = {
        'current_host': 'algo-1',
        'hosts': ['algo-1', 'algo-2', 'algo-3']
    }
    input_data_config = {
        'train': {
            'ContentType': 'trainingContentType',
            'TrainingInputMode': 'File',
            'S3DistributionType': 'FullyReplicated',
            'RecordWrapperType': 'None'
        },
        'validation': {
            'TrainingInputMode': 'File',
            'S3DistributionType': 'FullyReplicated',
            'RecordWrapperType': 'None'
        }
    }
    # Point the framework at a dummy training module while the env is
    # constructed, then clear the variable again.
    os.environ[
        framework.params.
        FRAMEWORK_TRAINING_MODULE_ENV] = 'test.functional.simple_framework:train'
    training_env = framework.training_env(resource_config=resource_config,
                                          input_data_config=input_data_config,
                                          hyperparameters=hyperparameters)
    os.environ[framework.params.FRAMEWORK_TRAINING_MODULE_ENV] = ''
    args = framework.mapping.to_cmd_args(training_env.hyperparameters)
    env_vars = training_env.to_env_vars()
    # SM_USER_ARGS is derived here from the cmd args, not by to_env_vars().
    env_vars['SM_USER_ARGS'] = ' '.join(args)
    assert env_vars['SM_OUTPUT_DATA_DIR'] == training_env.output_data_dir
    # Input data config is serialized as compact JSON (keys sorted, per the
    # expected literal below).
    assert env_vars['SM_INPUT_DATA_CONFIG'] == '{"train":{"ContentType":"trainingContentType",' \
                                               '"RecordWrapperType":"None","S3DistributionType":"FullyReplicated",' \
                                               '"TrainingInputMode":"File"},"validation":{"RecordWrapperType":"None",' \
                                               '"S3DistributionType":"FullyReplicated","TrainingInputMode":"File"}}'
    assert env_vars['SM_NETWORK_INTERFACE_NAME'] == 'ethwe'
    assert env_vars['SM_LOG_LEVEL'] == '20'  # 20 == logging.INFO
    assert env_vars['SM_INPUT_DIR'].endswith('/opt/ml/input')
    assert env_vars['SM_NUM_CPUS'] == str(training_env.num_cpus)
    assert env_vars['SM_HP_BATCH_SIZE'] == '64'
    assert env_vars['SM_CHANNEL_TRAIN'].endswith('/opt/ml/input/data/train')
    assert env_vars['SM_CHANNEL_VALIDATION'].endswith(
        '/opt/ml/input/data/validation')
    assert env_vars['SM_HP_EPOCHS'] == '10'
    # SM_HPS carries only the user hyperparameters as compact JSON — the
    # sagemaker_* keys are stripped.
    assert env_vars[
        'SM_HPS'] == '{"batch_size":64,"epochs":10,"loss":"SGD","precision":5.434322}'
    assert env_vars['SM_HP_PRECISION'] == '5.434322'
    assert env_vars[
        'SM_RESOURCE_CONFIG'] == '{"current_host":"algo-1","hosts":["algo-1","algo-2","algo-3"]}'
    # Module name is sagemaker_program with the .py suffix dropped.
    assert env_vars['SM_MODULE_NAME'] == 'user_script'
    assert env_vars['SM_INPUT_CONFIG_DIR'].endswith('/opt/ml/input/config')
    # User args are the sorted user hyperparameters rendered as CLI flags.
    assert env_vars[
        'SM_USER_ARGS'] == '--batch_size 64 --epochs 10 --loss SGD --precision 5.434322'
    assert env_vars['SM_OUTPUT_DIR'].endswith('/opt/ml/output')
    assert env_vars['SM_MODEL_DIR'].endswith('/opt/ml/model')
    assert env_vars['SM_HOSTS'] == '["algo-1","algo-2","algo-3"]'
    assert env_vars['SM_NUM_GPUS'] == str(training_env.num_gpus)
    assert env_vars['SM_MODULE_DIR'] == 's3/something'
    assert env_vars['SM_CURRENT_HOST'] == 'algo-1'
    assert env_vars['SM_CHANNELS'] == '["train","validation"]'
    assert env_vars['SM_HP_LOSS'] == 'SGD'
    assert env_vars[
        'SM_FRAMEWORK_MODULE'] == 'test.functional.simple_framework:train'
    # Every property name of the env must appear inside SM_TRAINING_ENV.
    assert all(x in env_vars['SM_TRAINING_ENV']
               for x in (training_env.properties()))
def cli(program, args):
    """CLI entry point: build the training environment and delegate to train().

    Args:
        program: passed through to train() unchanged.
        args: passed through to train() unchanged.
    """
    hps = framework.env.read_hyperparameters()
    training_environment = framework.training_env(hyperparameters=hps)
    # Honor the log level requested by the environment before training starts.
    logger.setLevel(training_environment.log_level)
    train(training_environment, hps, program, args)