def test_mnist_cpu(docker_image, opt_ml, use_gpu):
    local_mode.train(mnist_script, data_dir, docker_image, opt_ml,
                     use_gpu=use_gpu)

    assert local_mode.file_exists(
        opt_ml, 'model/model.pth'), 'Model file was not created'
    assert local_mode.file_exists(
        opt_ml, 'output/success'), 'Success file was not created'
    assert not local_mode.file_exists(opt_ml, 'output/failure'), 'Failure happened'

def test_mnist(docker_image, opt_ml, processor):
    local_mode.train(mnist_script, data_dir, docker_image, opt_ml,
                     hyperparameters={'processor': processor})

    assert local_mode.file_exists(
        opt_ml, 'model/model.pth'), 'Model file was not created'
    assert local_mode.file_exists(
        opt_ml, 'output/success'), 'Success file was not created'
    assert not local_mode.file_exists(opt_ml, 'output/failure'), 'Failure happened'

def test_dist_operations_path_cpu(docker_image, opt_ml, dist_cpu_backend):
    local_mode.train(dist_operations_path, data_dir, docker_image, opt_ml,
                     cluster_size=3,
                     hyperparameters={'backend': dist_cpu_backend})

    assert local_mode.file_exists(
        opt_ml, 'model/success'), 'Script success file was not created'
    assert local_mode.file_exists(
        opt_ml, 'output/success'), 'Success file was not created'
    assert not local_mode.file_exists(opt_ml, 'output/failure'), 'Failure happened'

def test_cpu_nccl(docker_image, opt_ml):
    # The NCCL backend requires GPUs, so training on CPU hosts should fail.
    local_mode.train(mnist_script, data_dir, docker_image, opt_ml,
                     cluster_size=2, hyperparameters={'backend': 'nccl'})

    assert not local_mode.file_exists(
        opt_ml, 'model/success'), 'Script success file should not have been created'
    assert not local_mode.file_exists(
        opt_ml, 'output/success'), 'Success file should not have been created'
    assert local_mode.file_exists(opt_ml, 'output/failure'), 'Failure did not happen'

def test_mnist_distributed_cpu(docker_image, opt_ml, dist_cpu_backend):
    local_mode.train(mnist_script, data_dir, docker_image, opt_ml,
                     cluster_size=2,
                     hyperparameters={'backend': dist_cpu_backend})

    assert local_mode.file_exists(
        opt_ml, 'model/model.pth'), 'Model file was not created'
    assert local_mode.file_exists(
        opt_ml, 'output/success'), 'Success file was not created'
    assert not local_mode.file_exists(opt_ml, 'output/failure'), 'Failure happened'

def test_single_machine_failure(docker_image, opt_ml, use_gpu):
    customer_script = 'failure_script.py'

    local_mode.train(customer_script, resource_path, docker_image, opt_ml,
                     source_dir=resource_path, use_gpu=use_gpu)

    assert local_mode.file_exists(opt_ml, 'output/failure'), 'Failure did not happen'

def test_xgboost_training_multiple_machines_without_early_stopping(
        docker_image, opt_ml):
    hyperparameters = get_default_hyperparameters(100000)
    hyperparameters['save_model_on_termination'] = 'false'

    # early_stopping=True has the harness stop the job before training
    # completes; with save_model_on_termination disabled, no host should
    # save a model.
    local_mode.train(False, data_dir, docker_image, opt_ml,
                     hyperparameters=hyperparameters, cluster_size=2,
                     early_stopping=True)

    host1 = local_mode.file_exists(opt_ml, "model/xgboost-model", 'algo-1')
    host2 = local_mode.file_exists(opt_ml, "model/xgboost-model", 'algo-2')
    assert not (host1 or host2), "Model should not have been saved on any host"

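# get_default_hyperparameters is defined elsewhere in the test suite. A
# hypothetical sketch of its shape, assuming its argument is the number of
# boosting rounds (the large value above keeps training running long enough
# for the harness to terminate it early); the real helper likely sets more keys:
def get_default_hyperparameters(num_round):
    return {
        'objective': 'binary:logistic',  # assumed objective
        'num_round': num_round,
    }
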
def test_xgboost_boston_single_machine(docker_image, opt_ml):
    customer_script = 'single_machine_customer_script.py'
    hyperparameters = {
        'objective': 'reg:linear',
        'colsample-bytree': 0.3,
        'learning-rate': 0.1,
        'max-depth': 5,
        'reg-alpha': 10,
        'n-estimators': 10
    }

    local_mode.train(customer_script, data_dir, docker_image, opt_ml,
                     hyperparameters=hyperparameters, source_dir=boston_path)

    files = [
        'model/xgb-boston.model', 'output/data/cv_results.csv',
        'output/data/feature-importance-plot.png'
    ]

    assert not local_mode.file_exists(opt_ml, 'output/failure'), 'Failure happened'
    test_utils.files_exist(opt_ml, files)

def test_fastai_mnist(docker_image, opt_ml, use_gpu, py_version):
    if py_version != PYTHON3:
        print('Skipping the test because fastai requires Python 3.6 or later.')
        return

    local_mode.train(fastai_mnist_script, os.path.join(fastai_path, 'mnist_tiny'),
                     docker_image, opt_ml, use_gpu=use_gpu)

    assert local_mode.file_exists(
        opt_ml, 'model/model.pth'), 'Model file was not created'
    assert local_mode.file_exists(
        opt_ml, 'output/success'), 'Success file was not created'
    assert not local_mode.file_exists(opt_ml, 'output/failure'), 'Failure happened'

def test_distributed_failure(docker_image, opt_ml, use_gpu):
    customer_script = 'failure_script.py'
    cluster_size = 2
    hyperparameters = {
        'sagemaker_process_slots_per_host': 1,
        'sagemaker_num_processes': cluster_size,
        'node_to_fail': 1
    }

    local_mode.train(customer_script, resource_path, docker_image, opt_ml,
                     hyperparameters=hyperparameters, source_dir=resource_path,
                     use_gpu=use_gpu, cluster_size=cluster_size)

    assert local_mode.file_exists(opt_ml, 'output/failure'), 'Failure did not happen'

def test_xgboost_abalone_kfold(dataset, extra_hps, model_file_count,
                               docker_image, opt_ml):
    hyperparameters = get_abalone_default_hyperparameters()
    data_path = os.path.join(data_root, dataset, "data")

    local_mode.train(
        False,
        data_path,
        docker_image,
        opt_ml,
        hyperparameters={**hyperparameters, **extra_hps},
    )

    files = [f"model/xgboost-model-{i}" for i in range(model_file_count)]
    assert not local_mode.file_exists(opt_ml, "output/failure"), "Failure happened"
    test_utils.files_exist(opt_ml, files)
    assert local_mode.file_exists(
        opt_ml, "output/data/predictions.csv"), "Predictions file was not created"

def test_chainer_mnist_single_machine(docker_image, opt_ml, use_gpu):
    customer_script = 'single_machine_customer_script.py'
    hyperparameters = {'batch-size': 10000, 'epochs': 1}

    local_mode.train(customer_script, data_dir, docker_image, opt_ml,
                     hyperparameters=hyperparameters, source_dir=mnist_path,
                     use_gpu=use_gpu)

    files = [
        'model/model.npz', 'output/success', 'output/data/algo-1/accuracy.png',
        'output/data/algo-1/cg.dot', 'output/data/algo-1/log',
        'output/data/algo-1/loss.png'
    ]

    test_utils.files_exist(opt_ml, files)

    assert not local_mode.file_exists(opt_ml, 'output/failure'), 'Failure happened'

    script_path = os.path.join(mnist_path, customer_script)

    with local_mode.serve(script_path, model_dir=None, image_name=docker_image,
                          opt_ml=opt_ml, use_gpu=use_gpu, source_dir=mnist_path):
        test_arrays = [
            np.zeros((100, 784), dtype='float32'),
            np.zeros((100, 1, 28, 28), dtype='float32'),
            np.zeros((100, 28, 28), dtype='float32')
        ]

        request_data = np.zeros((100, 784), dtype='float32')

        data_as_list = request_data.tolist()
        test_utils.predict_and_assert_response_length(data_as_list, 'text/csv')

        for array in test_arrays:
            # JSON and NPY can take multidimensional (n > 2) arrays
            data_as_list = array.tolist()
            test_utils.predict_and_assert_response_length(
                data_as_list, 'application/json')
            test_utils.predict_and_assert_response_length(
                array, 'application/x-npy')

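# test_utils.predict_and_assert_response_length is not shown in this file. A
# minimal sketch of what it plausibly does, assuming local_mode.request
# serializes the payload for the given content type against the locally
# served endpoint and returns (status_code, body), as in the inference tests
# below; parsing the body as JSON is a simplifying assumption:
import json

def predict_and_assert_response_length(data, content_type):
    status_code, response_body = local_mode.request(data, content_type=content_type)
    assert status_code == 200
    predictions = json.loads(response_body)  # assumes a JSON response body
    # the server should return one prediction per input row
    assert len(predictions) == len(data)
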
def test_package_version(docker_image, opt_ml):
    version_check_script = "train.py"

    local_mode.train(
        version_check_script,
        data_dir,
        docker_image,
        opt_ml,
        source_dir=script_path,
    )

    assert not local_mode.file_exists(opt_ml, "output/failure"), "Failure happened"

def test_xgboost_training_single_machine_without_early_stopping(
        docker_image, opt_ml):
    hyperparameters = get_default_hyperparameters(100000)
    hyperparameters['save_model_on_termination'] = 'false'

    local_mode.train(False, data_dir, docker_image, opt_ml,
                     hyperparameters=hyperparameters, early_stopping=True,
                     train_time=5)

    assert not local_mode.file_exists(
        opt_ml, "model/xgboost-model"), "Model should not have been saved"

def test_training_jobs_do_not_stall(docker_image, opt_ml, use_gpu):
    """
    This test validates that training does not stall.
    https://github.com/chainer/chainermn/issues/236
    """
    customer_script = 'training_jobs_do_not_stall_customer_script.py'
    cluster_size = 2
    hyperparameters = {
        'sagemaker_use_mpi': True,
        'sagemaker_process_slots_per_host': 1,
        'sagemaker_num_processes': 2
    }

    local_mode.train(customer_script, resource_path, docker_image, opt_ml,
                     hyperparameters=hyperparameters, source_dir=resource_path,
                     use_gpu=use_gpu, cluster_size=cluster_size)

    assert local_mode.file_exists(opt_ml, 'output/failure'), 'Failure did not happen'

def test_xgboost_abalone_inference(docker_image, opt_ml):
    customer_script = "abalone_distributed.py"
    request_body = get_libsvm_request_body()

    with local_mode.serve(customer_script, libsvm_model_dir, docker_image,
                          opt_ml, source_dir=abalone_path):
        response_status_code, response_body = local_mode.request(
            request_body, content_type="text/libsvm")

    assert response_status_code == 200
    assert not local_mode.file_exists(opt_ml, "output/failure"), "Failure happened"
    assert len(response_body.split(",")) == 1

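# get_libsvm_request_body is assumed to return a record in LibSVM format
# (label followed by index:value feature pairs). A hypothetical stand-in
# with made-up abalone-like values:
def get_libsvm_request_body():
    return '15 1:0.455 2:0.365 3:0.095 4:0.514 5:0.2245 6:0.101 7:0.15 8:2'
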
def test_chainer_mnist_distributed(docker_image, opt_ml, use_gpu,
                                   customer_script):
    cluster_size = 2
    # pure_nccl communicator hangs when only one gpu is available.
    hyperparameters = {
        'sagemaker_process_slots_per_host': 1,
        'sagemaker_num_processes': cluster_size,
        'batch-size': 10000,
        'epochs': 1,
        'communicator': 'hierarchical'
    }

    local_mode.train(customer_script, data_dir, docker_image, opt_ml,
                     hyperparameters=hyperparameters,
                     cluster_size=cluster_size, source_dir=mnist_path,
                     use_gpu=use_gpu)

    files = [
        'model/model.npz', 'output/success', 'output/data/algo-1/accuracy.png',
        'output/data/algo-1/cg.dot', 'output/data/algo-1/log',
        'output/data/algo-1/loss.png'
    ]

    test_utils.files_exist(opt_ml, files)

    assert not local_mode.file_exists(opt_ml, 'output/failure'), 'Failure happened'

    with local_mode.serve(os.path.join(mnist_path, customer_script),
                          model_dir=None, image_name=docker_image,
                          opt_ml=opt_ml):
        request_data = np.zeros((100, 784), dtype='float32')

        data_as_list = request_data.tolist()

        test_utils.predict_and_assert_response_length(data_as_list,
                                                      'application/json')
        test_utils.predict_and_assert_response_length(data_as_list, 'text/csv')
        test_utils.predict_and_assert_response_length(request_data,
                                                      'application/x-npy')

def test_all_processes_finish_with_mpi(docker_image, opt_ml, use_gpu):
    """
    This test validates that all training processes finish before containers
    are shut down.
    """
    customer_script = 'all_processes_finish_customer_script.py'
    cluster_size = 2
    hyperparameters = {
        'sagemaker_use_mpi': True,
        'sagemaker_process_slots_per_host': 2,
        'sagemaker_num_processes': 4
    }

    local_mode.train(customer_script, resource_path, docker_image, opt_ml,
                     hyperparameters=hyperparameters, source_dir=resource_path,
                     use_gpu=use_gpu, cluster_size=cluster_size)

    file_name = 'output/data/process_could_complete'
    assert local_mode.file_exists(
        opt_ml, file_name, host='algo-2'), 'Process was not able to complete'

def test_xgboost_abalone_training_single_machine(docker_image, opt_ml):
    customer_script = "abalone_distributed.py"
    hyperparameters = get_abalone_default_hyperparameters()

    local_mode.train(
        customer_script,
        data_dir,
        docker_image,
        opt_ml,
        hyperparameters=hyperparameters,
        source_dir=abalone_path,
    )

    files = ["model/xgboost-model"]
    assert not local_mode.file_exists(opt_ml, "output/failure"), "Failure happened"
    test_utils.files_exist(opt_ml, files)

def test_xgboost_abalone_mme_with_transform_fn(docker_image, opt_ml):
    customer_script = "abalone_distributed.py"
    request_body = get_libsvm_request_body()
    additional_env_vars = [
        "SAGEMAKER_BIND_TO_PORT=8080",
        "SAGEMAKER_SAFE_PORT_RANGE=9000-9999",
        "SAGEMAKER_MULTI_MODEL=true",
    ]
    model_name = "libsvm_pickled"
    model_data = json.dumps({
        "model_name": model_name,
        "url": "/opt/ml/model/{}".format(model_name)
    })

    with append_transform_fn_to_abalone_script(
            abalone_path, customer_script) as custom_script_path:
        with local_mode.serve(
                customer_script,
                models_dir,
                docker_image,
                opt_ml,
                source_dir=custom_script_path,
                additional_env_vars=additional_env_vars,
        ):
            load_status_code, _ = local_mode.request(
                model_data,
                content_type="application/json",
                request_url=MME_MODELS_URL.format(model_name),
            )
            assert load_status_code == 200

            invoke_status_code, invoke_response_body = local_mode.request(
                request_body,
                content_type="text/libsvm",
                request_url=MME_INVOKE_URL.format(model_name),
            )
            assert invoke_status_code == 200
            # final column is the bias term
            assert (len(invoke_response_body.split(","))
                    == len(request_body.split()) + 1)

    assert not local_mode.file_exists(opt_ml, "output/failure"), "Failure happened"

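# MME_MODELS_URL and MME_INVOKE_URL are assumed module-level templates for
# the multi-model endpoint. The paths below follow the SageMaker multi-model
# container convention (load via the models resource, invoke via
# /models/<name>/invoke), but the exact host, port, and template shapes are
# assumptions about this test harness:
MME_MODELS_URL = 'http://localhost:8080/models/{}'
MME_INVOKE_URL = 'http://localhost:8080/models/{}/invoke'
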
def test_xgboost_abalone_custom_inference_with_transform_fn(
        docker_image, opt_ml):
    customer_script = "abalone_distributed.py"
    request_body = get_libsvm_request_body()

    with append_transform_fn_to_abalone_script(
            abalone_path, customer_script) as custom_script_path:
        with local_mode.serve(
                customer_script,
                libsvm_model_dir,
                docker_image,
                opt_ml,
                source_dir=custom_script_path,
        ):
            response_status_code, response_body = local_mode.request(
                request_body, content_type="text/libsvm")

    assert response_status_code == 200
    assert not local_mode.file_exists(opt_ml, "output/failure"), "Failure happened"
    # final column is the bias term
    assert (len(response_body.split(","))
            == len(request_body.split()) + 1)

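# append_transform_fn_to_abalone_script is assumed to be a context manager
# that copies the customer script into a temporary directory, appends a
# transform_fn override, and yields that directory for use as source_dir.
# A hedged sketch; the appended function body is purely illustrative:
import contextlib
import os
import shutil
import tempfile

@contextlib.contextmanager
def append_transform_fn_to_abalone_script(script_dir, script_name):
    tmp_dir = tempfile.mkdtemp()
    try:
        script_copy = os.path.join(tmp_dir, script_name)
        shutil.copy(os.path.join(script_dir, script_name), script_copy)
        with open(script_copy, 'a') as f:
            f.write('\n\ndef transform_fn(model, request_body, content_type, accept):\n'
                    '    # illustrative only: a real override would deserialize\n'
                    '    # the request, predict, and serialize the response here\n'
                    '    ...\n')
        yield tmp_dir
    finally:
        shutil.rmtree(tmp_dir)
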
def test_chainer_mnist_custom_loop(docker_image, opt_ml, use_gpu):
    customer_script = 'single_machine_custom_loop.py'
    hyperparameters = {'batch-size': 10000, 'epochs': 1}

    local_mode.train(customer_script, data_dir, docker_image, opt_ml,
                     hyperparameters=hyperparameters, source_dir=mnist_path,
                     use_gpu=use_gpu)

    files = ['model/model.npz', 'output/success']

    test_utils.files_exist(opt_ml, files)

    assert not local_mode.file_exists(opt_ml, 'output/failure'), 'Failure happened'

    script_path = os.path.join(mnist_path, customer_script)

    with local_mode.serve(script_path, model_dir=None,
                          image_name=docker_image, opt_ml=opt_ml):
        request_data = np.zeros((100, 784), dtype='float32')

        data_as_list = request_data.tolist()

        test_utils.predict_and_assert_response_length(data_as_list,
                                                      'application/json')
        test_utils.predict_and_assert_response_length(data_as_list, 'text/csv')
        test_utils.predict_and_assert_response_length(request_data,
                                                      'application/x-npy')

def files_exist(opt_ml, files):
    for f in files:
        assert local_mode.file_exists(opt_ml, f), 'file {} was not created'.format(f)

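# local_mode.file_exists, used throughout these tests, is assumed to check
# for a file under the /opt/ml directory that local mode mounts out of each
# container. A minimal sketch, assuming opt_ml is the host-side root and
# each container writes under a per-host subdirectory:
import os

def file_exists(opt_ml, path, host='algo-1'):
    return os.path.exists(os.path.join(opt_ml, host, path))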